diff --git a/.clang-format b/.clang-format
index c7370bb66a..018938c588 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,156 +1,255 @@
----
-Language: Cpp
-# BasedOnStyle: Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveMacros: false
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllArgumentsOnNextLine: true
-AllowAllConstructorInitializersOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortLambdasOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: WithoutElse
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterCaseLabel: false
-  AfterClass: false
-  AfterControlStatement: false
-  AfterEnum: false
-  AfterFunction: false
-  AfterNamespace: false
-  AfterObjCDeclaration: false
-  AfterStruct: false
-  AfterUnion: false
-  AfterExternBlock: false
-  BeforeCatch: false
-  BeforeElse: false
-  IndentBraces: false
+# Reference: https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+
+# Disable formatting
+DisableFormat: false
+
+# Base style
+BasedOnStyle: LLVM
+
+# Language: None, Cpp, Java, JavaScript, ObjC, Proto, TableGen, TextProto
+Language: Cpp
+
+# Standard: Cpp03, Cpp11, Auto
+Standard: Cpp11
+
+# Tab width
+TabWidth: 4
+
+# Use tab characters: Never, ForIndentation, ForContinuationAndIndentation, Always
+UseTab: Never
+
+# Offset of access specifiers (public, private, etc.)
+AccessModifierOffset: -2
+
+# Indent width
+IndentWidth: 4
+
+# Indent width of constructor initializer lists
+ConstructorInitializerIndentWidth: 4
+
+# Minimum indent width of continuation lines
+ContinuationIndentWidth: 4
+
+# Indent case labels
+IndentCaseLabels: true
+
+# When the return type wraps, indent the function name of declarations and definitions
+IndentWrappedFunctionNames: true
+
+# Namespace indentation: None, Inner (indent contents of nested namespaces), All
+NamespaceIndentation: All
+
+# Preprocessor directive indentation: None, AfterHash, BeforeHash
+IndentPPDirectives: BeforeHash
+
+# Alignment after open brackets (round, angle, and square): Align, DontAlign,
+# AlwaysBreak (always break after an open bracket)
+AlignAfterOpenBracket: Align
+
+# Align the equals signs of consecutive assignments
+#AlignConsecutiveAssignments: AcrossEmptyLinesAndComments
+AlignConsecutiveAssignments: AcrossComments
+
+# Align the variable names of consecutive declarations
+AlignConsecutiveDeclarations: AcrossEmptyLinesAndComments
+#AlignConsecutiveDeclarations: AcrossComments
+
+#AlignEscapedNewlines: Right
+
+# Left-align the backslashes of escaped newlines (backslash line continuations)
+#AlignEscapedNewlinesLeft: true
+
+# Horizontally align operands of binary and ternary expressions
+AlignOperands: true
+
+# Align consecutive trailing comments
+AlignTrailingComments: true
+
+# Pointer and reference alignment: Left, Right, Middle
+PointerAlignment: Left
+
+# Derive the most common pointer and reference alignment from the file
+DerivePointerAlignment: false
+
+# Allow all parameters of a function declaration to be put onto the next line
+AllowAllParametersOfDeclarationOnNextLine: false
+
+# false means call arguments are either all on one line or each on its own line
+BinPackArguments: false
+
+# false means parameters are either all on one line or each on its own line
+BinPackParameters: false
+
+# Allow all arguments of a call to be put onto the next line even when
+# BinPackParameters is false
+AllowAllArgumentsOnNextLine: false
+
+# Allow short blocks on a single line
+AllowShortBlocksOnASingleLine: true
+
+# Allow short case labels on a single line
+AllowShortCaseLabelsOnASingleLine: true
+
+# Allow short functions on a single line: None, InlineOnly (defined in a class),
+# Empty (empty functions), Inline (defined in a class, or empty), All
+AllowShortFunctionsOnASingleLine: Empty
+
+# Allow short if statements on a single line
+AllowShortIfStatementsOnASingleLine: true
+
+# Allow short loops on a single line
+AllowShortLoopsOnASingleLine: true
+
+# Always break after the return type of a definition (deprecated)
+AlwaysBreakAfterDefinitionReturnType: None
+
+# Always break after the return type: None, All, TopLevel (top-level functions,
+# excluding member functions), AllDefinitions (all definitions, excluding
+# declarations), TopLevelDefinitions (definitions of all top-level functions)
+AlwaysBreakAfterReturnType: None
+
+# Always break before multiline string literals
+AlwaysBreakBeforeMultilineStrings: false
+
+# Always break after template declarations
+AlwaysBreakTemplateDeclarations: true
+
+# Constructor initializers are either all on one line or each on its own line
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+
+# Break constructor initializer lists before the commas and align the initializers
+BreakConstructorInitializers: BeforeComma
+
+# Automatically detect whether calls and definitions are formatted with one
+# argument per line (experimental)
+ExperimentalAutoDetectBinPacking: true
+
+# Remove spaces after { and before } in C++11 braced list initialization
+Cpp11BracedListStyle: true
+
+# Brace wrapping; only effective when BreakBeforeBraces is set to Custom
+BraceWrapping:
+  # after class definitions
+  AfterClass: true
+  # after control statements
+  AfterControlStatement: true
+  # after enum definitions
+  AfterEnum: true
+  # after function definitions
+  AfterFunction: true
+  # after namespace definitions
+  AfterNamespace: true
+  # after ObjC declarations
+  AfterObjCDeclaration: true
+  # after struct definitions
+  AfterStruct: true
+  # after union definitions
+  AfterUnion: true
+  AfterExternBlock: true
+  # before catch
+  BeforeCatch: true
+  # before else
+  BeforeElse: true
+  # indent wrapped braces
+  IndentBraces: false
   SplitEmptyFunction: true
   SplitEmptyRecord: true
   SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 100
-CommentPragmas: '^ IWYU pragma:'
+
+# Break before binary operators: None (break after operators), NonAssignment
+# (break before operators that are not assignments), All (break before operators)
+BreakBeforeBinaryOperators: None
+
+# Break before braces: Attach (always attach braces to the surrounding context),
+# Linux (like Attach, except for function, namespace, and class definitions),
+# Mozilla (like Attach, except for enum, function, and record definitions),
+# Stroustrup (like Attach, except for function definitions, catch, and else),
+# Allman (always break before braces), GNU (always break before braces, with
+# extra indentation for the braces of control statements), WebKit (break before
+# function braces), Custom
+# Note: statement blocks are treated as functions here
+BreakBeforeBraces: Allman
+
+# Break before ternary operators
+BreakBeforeTernaryOperators: false
+
+# Break string literals
+BreakStringLiterals: false
+
+# Column limit; 0 means no limit
+ColumnLimit: 0
+
+# Penalty for breaking around an assignment
+PenaltyBreakAssignment: 100
+
+# Penalty for breaking a function call after call(
+PenaltyBreakBeforeFirstCallParameter: 100
+
+# Penalty for introducing a line break inside a comment
+PenaltyBreakComment: 100
+
+# Penalty for breaking before the first <<
+PenaltyBreakFirstLessLess: 100
+
+# Penalty for introducing a line break inside a string literal
+PenaltyBreakString: 100
+
+# Penalty for each character outside the column limit
+PenaltyExcessCharacter: 100
+
+# Penalty for putting a function's return type on its own line
+PenaltyReturnTypeOnItsOwnLine: 100
+
+# Add a space after C-style casts
+SpaceAfterCStyleCast: false
+
+# Add a space after the template keyword
+SpaceAfterTemplateKeyword: false
+
+# Add a space before assignment operators
+SpaceBeforeAssignmentOperators: true
+
+# Add a space before opening parentheses: Never, ControlStatements, Always
+SpaceBeforeParens: ControlStatements
+
+# Number of spaces before trailing comments (applies to // only)
+SpacesBeforeTrailingComments: 2
+
+# Add spaces after < and before > in angle brackets
+SpacesInAngles: false
+
+# Add spaces in container literals (e.g. ObjC and JavaScript arrays and dictionaries)
+SpacesInContainerLiterals: false
+
+# Add spaces in the parentheses of C-style casts
+SpacesInCStyleCastParentheses: false
+
+# Add spaces after ( and before )
+SpacesInParentheses: false
+
+# Add a space in empty parentheses
+SpaceInEmptyParentheses: false
+
+# Add spaces after [ and before ]; lambdas and unsized array declarations
+# are not affected
+SpacesInSquareBrackets: false
+
+# Penalty for each character of indentation whitespace
+PenaltyIndentedWhitespace: 10
+
+# Regular expression describing comments with special meaning that should not
+# be split across lines or otherwise changed
+CommentPragmas: '^ IWYU pragma:'
+
+# Merge consecutive namespace declarations
 CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DerivePointerAlignment: true
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks: Regroup
-IncludeCategories:
-  - Regex: '^<ext/.*\.h>'
-    Priority: 2
-  - Regex: '^<.*\.h>'
-    Priority: 1
-  - Regex: '^<.*'
-    Priority: 2
-  - Regex: '.*'
-    Priority: 3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-RawStringFormats:
-  - Language: Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-    BasedOnStyle: google
-  - Language: TextProto
-    Delimiters:
-      - pb
-      - PB
-      - proto
-      - PROTO
-    EnclosingFunctions:
-      - EqualsProto
-      - EquivToProto
-      - PARSE_PARTIAL_TEXT_PROTO
-      - PARSE_TEST_PROTO
-      - PARSE_TEXT_PROTO
-      - ParseTextOrDie
-      - ParseTextProtoOrDie
-    CanonicalDelimiter: ''
-    BasedOnStyle: google
-ReflowComments: true
-SortIncludes: true
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Auto
-StatementMacros:
-  - Q_UNUSED
-  - QT_REQUIRE_VERSION
-TabWidth: 8
-UseTab: Never
-...
+
+# Keep empty lines at the start of blocks
+KeepEmptyLinesAtTheStartOfBlocks: false
+
+# Maximum number of consecutive empty lines
+MaxEmptyLinesToKeep: 2
+
+# Allow reflowing comments
+ReflowComments: true
+
+# Allow sorting #include directives
+SortIncludes: false
+
+# Sort #includes by category: an #include matching a regex gets that category's
+# priority, and unmatched ones default to INT_MAX (lower priorities sort first).
+# Negative priorities can be used to keep certain #includes at the very top.
+IncludeCategories:
+  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority: 2
+  - Regex: '^(<|"(gtest|isl|json)/)'
+    Priority: 3
+  - Regex: '.*'
+    Priority: 1
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 2d78033dcf..452e541fc8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -113,7 +113,7 @@ venv*
 data/
 data
 
-.vscode
+# .vscode
 .idea
 .DS_Store
 
@@ -172,3 +172,5 @@ demo/csharp/*/Properties
 
 # doxygen
 docs/cppapi/docs
+
+*debug*
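Note: the rewritten .clang-format moves the code base from the Google preset to an LLVM-based custom style (Allman braces, 4-space indent, no column limit). A minimal sketch of applying it tree-wide from CMake; the target name and source glob are assumptions for illustration, not part of this diff, and it only presumes clang-format is on PATH and picks the config up via `--style=file`:

```cmake
# Hypothetical formatting helper; not part of this change.
find_program(CLANG_FORMAT_EXE NAMES clang-format)
if(CLANG_FORMAT_EXE)
  file(GLOB_RECURSE _FORMAT_SOURCES ${CMAKE_SOURCE_DIR}/csrc/*.cpp
       ${CMAKE_SOURCE_DIR}/csrc/*.h)
  # --style=file makes clang-format search upward for the .clang-format above
  add_custom_target(
    clang-format
    COMMAND ${CLANG_FORMAT_EXE} -i --style=file ${_FORMAT_SOURCES}
    COMMENT "Reformatting csrc/ with the repository .clang-format")
endif()
```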
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a241d90674..a4252ef0d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,12 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-if (NOT DEFINED CMAKE_INSTALL_PREFIX)
-    set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "installation directory")
-endif ()
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX
+      "${CMAKE_BINARY_DIR}/install"
+      CACHE PATH "installation directory")
+endif()
 message(STATUS "CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}")
 
-if (NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "choose 'Release' as default build type" FORCE)
-endif ()
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE
+      Release
+      CACHE STRING "choose 'Release' as default build type" FORCE)
+endif()
 
 cmake_minimum_required(VERSION 3.14)
 project(MMDeploy VERSION 1.3.1)
@@ -18,11 +22,11 @@ set(MMDEPLOY_VERSION_MINOR ${PROJECT_VERSION_MINOR})
 set(MMDEPLOY_VERSION_PATCH ${PROJECT_VERSION_PATCH})
 
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-if (MSVC)
-    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-else ()
-    set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
-endif ()
+if(MSVC)
+  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+else()
+  set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
+endif()
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
 # options
@@ -41,141 +45,146 @@ option(MMDEPLOY_COVERAGE "build SDK for coverage" OFF)
 option(MMDEPLOY_USE_MSCV_STATIC "statically linked CRT" OFF)
 option(MMDEPLOY_ELENA_FUSION "use elena to fuse preprocess" OFF)
 
-set(MMDEPLOY_TARGET_DEVICES "cpu" CACHE STRING "target devices to support")
-set(MMDEPLOY_TARGET_BACKENDS "" CACHE STRING "target inference engines to support")
-set(MMDEPLOY_CODEBASES "all" CACHE STRING "select OpenMMLab codebases")
-
-if ((NOT MMDEPLOY_BUILD_SDK_MONOLITHIC) AND MMDEPLOY_DYNAMIC_BACKEND)
-    set(MMDEPLOY_DYNAMIC_BACKEND OFF)
-endif ()
+set(MMDEPLOY_TARGET_DEVICES
+    "cpu"
+    CACHE STRING "target devices to support")
+set(MMDEPLOY_TARGET_BACKENDS
+    ""
+    CACHE STRING "target inference engines to support")
+set(MMDEPLOY_CODEBASES
+    "all"
+    CACHE STRING "select OpenMMLab codebases")
+
+if((NOT MMDEPLOY_BUILD_SDK_MONOLITHIC) AND MMDEPLOY_DYNAMIC_BACKEND)
+  set(MMDEPLOY_DYNAMIC_BACKEND OFF)
+endif()
 
-if (MMDEPLOY_SHARED_LIBS)
-    set(MMDEPLOY_LIB_TYPE SHARED)
-else ()
-    set(MMDEPLOY_LIB_TYPE STATIC)
-endif ()
+if(MMDEPLOY_SHARED_LIBS)
+  set(MMDEPLOY_LIB_TYPE SHARED)
+else()
+  set(MMDEPLOY_LIB_TYPE STATIC)
+endif()
 
-set(MMDEPLOY_TASKS "" CACHE INTERNAL "")
+set(MMDEPLOY_TASKS
+    ""
+    CACHE INTERNAL "")
 
-if (MMDEPLOY_COVERAGE)
-    add_compile_options(-coverage -fprofile-arcs -ftest-coverage)
-    add_link_options(-coverage -lgcov)
-endif ()
+if(MMDEPLOY_COVERAGE)
+  add_compile_options(-coverage -fprofile-arcs -ftest-coverage)
+  add_link_options(-coverage -lgcov)
+endif()
 
-# when CUDA devices are enabled, the environment variable ASAN_OPTIONS=protect_shadow_gap=0
-# must be set at runtime
-if (MMDEPLOY_ASAN_ENABLE)
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=address>)
-    add_link_options(-fsanitize=address)
-endif ()
+# when CUDA devices are enabled, the environment variable
+# ASAN_OPTIONS=protect_shadow_gap=0 must be set at runtime
+if(MMDEPLOY_ASAN_ENABLE)
+  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=address>)
+  add_link_options(-fsanitize=address)
+endif()
 
 # notice that ubsan has linker issues for ubuntu < 18.04, see
 # https://stackoverflow.com/questions/50024731/ld-unrecognized-option-push-state-no-as-needed
-if (MMDEPLOY_UBSAN_ENABLE)
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=undefined>)
-    add_link_options(-fsanitize=undefined)
-endif ()
-
-if (MSVC)
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/diagnostics:classic>)
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/wd4251>)
-    if (MMDEPLOY_USE_MSCV_STATIC)
-        foreach(lang C CXX)
-            string(REPLACE /MD /MT CMAKE_${lang}_FLAGS_DEBUG "${CMAKE_${lang}_FLAGS_DEBUG}")
-            string(REPLACE /MD /MT CMAKE_${lang}_FLAGS_RELEASE "${CMAKE_${lang}_FLAGS_RELEASE}")
-        endforeach()
-    endif ()
-endif ()
+if(MMDEPLOY_UBSAN_ENABLE)
+  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-fsanitize=undefined>)
+  add_link_options(-fsanitize=undefined)
+endif()
+
+if(MSVC)
+  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/diagnostics:classic>)
+  add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/wd4251>)
+  if(MMDEPLOY_USE_MSCV_STATIC)
+    foreach(lang C CXX)
+      string(REPLACE /MD /MT CMAKE_${lang}_FLAGS_DEBUG
+                     "${CMAKE_${lang}_FLAGS_DEBUG}")
+      string(REPLACE /MD /MT CMAKE_${lang}_FLAGS_RELEASE
+                     "${CMAKE_${lang}_FLAGS_RELEASE}")
+    endforeach()
+  endif()
+endif()
 
 if(APPLE)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fobjc-arc")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fobjc-arc")
 endif()
 
 add_library(MMDeployStaticModules INTERFACE)
 add_library(MMDeployDynamicModules INTERFACE)
 add_library(MMDeployLibs INTERFACE)
 
-if ((cuda IN_LIST MMDEPLOY_TARGET_DEVICES) OR (trt IN_LIST MMDEPLOY_TARGET_BACKENDS))
-    include(cmake/cuda.cmake NO_POLICY_SCOPE)
-endif ()
+if((cuda IN_LIST MMDEPLOY_TARGET_DEVICES) OR (trt IN_LIST
+                                              MMDEPLOY_TARGET_BACKENDS))
+  include(cmake/cuda.cmake NO_POLICY_SCOPE)
+endif()
 
-# this must come after including cuda.cmake because policies in function scope is captured
-# at function definition
+# this must come after including cuda.cmake because policies in function scope
+# are captured at function definition
 include(cmake/MMDeploy.cmake)
 
 add_subdirectory(csrc/mmdeploy)
 
-if (MMDEPLOY_BUILD_SDK)
-    if (NOT MMDEPLOY_BUILD_SDK_MONOLITHIC)
-        install(TARGETS MMDeployStaticModules
-                MMDeployDynamicModules
-                MMDeployLibs
-                EXPORT MMDeployTargets)
-    endif ()
-
-    if (MMDEPLOY_BUILD_TEST)
-        add_subdirectory(tests/test_csrc)
-    endif ()
-
-    if (MMDEPLOY_BUILD_EXAMPLES)
-        include(${CMAKE_SOURCE_DIR}/cmake/opencv.cmake)
-        add_subdirectory(demo/csrc)
-    endif ()
-
-    # export MMDeploy package
-    install(EXPORT MMDeployTargets
-            FILE MMDeployTargets.cmake
+if(MMDEPLOY_BUILD_SDK)
+  if(NOT MMDEPLOY_BUILD_SDK_MONOLITHIC)
+    install(TARGETS MMDeployStaticModules MMDeployDynamicModules MMDeployLibs
+            EXPORT MMDeployTargets)
+  endif()
+
+  if(MMDEPLOY_BUILD_TEST)
+    add_subdirectory(tests/test_csrc)
+  endif()
+
+  if(MMDEPLOY_BUILD_EXAMPLES)
+    include(${CMAKE_SOURCE_DIR}/cmake/opencv.cmake)
+    add_subdirectory(demo/csrc)
+  endif()
+
+  # export MMDeploy package
+  install(
+    EXPORT MMDeployTargets
+    FILE MMDeployTargets.cmake
+    DESTINATION lib/cmake/MMDeploy)
+
+  if(MMDEPLOY_SPDLOG_EXTERNAL)
+    set(SPDLOG_DEPENDENCY "find_package(spdlog QUIET)")
+  endif()
+  # append backend deps
+  mmdeploy_add_deps(trt BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS TENSORRT
+                    CUDNN)
+  mmdeploy_add_deps(ort BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS ONNXRUNTIME)
+  mmdeploy_add_deps(ncnn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS ncnn)
+  mmdeploy_add_deps(openvino BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS
+                    InferenceEngine)
+  if(NOT MMDEPLOY_SHARED_LIBS)
+    mmdeploy_add_deps(pplnn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS pplnn)
+  endif()
+  mmdeploy_add_deps(snpe BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS snpe)
+  mmdeploy_add_deps(rknn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS rknn)
+
+  include(CMakePackageConfigHelpers)
+  # generate the config file that includes the exports
+  configure_package_config_file(
+    ${CMAKE_SOURCE_DIR}/cmake/MMDeployConfig.cmake.in
+    "${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfig.cmake"
+    INSTALL_DESTINATION "lib/cmake"
+    NO_SET_AND_CHECK_MACRO NO_CHECK_REQUIRED_COMPONENTS_MACRO)
+
+  write_basic_package_version_file(
+    "${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfigVersion.cmake"
+    VERSION "${MMDeploy_VERSION_MAJOR}.${MMDeploy_VERSION_MINOR}"
+    COMPATIBILITY AnyNewerVersion)
+
+  install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfig.cmake
+          ${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfigVersion.cmake
+          ${CMAKE_CURRENT_SOURCE_DIR}/cmake/MMDeploy.cmake
+    DESTINATION lib/cmake/MMDeploy)
+
+  if(MSVC)
+    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/loader.cpp.in DESTINATION lib/cmake/MMDeploy)
+  endif()
+
+  install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules
+          DESTINATION lib/cmake/MMDeploy)
 
-    if (MMDEPLOY_SPDLOG_EXTERNAL)
-        set(SPDLOG_DEPENDENCY "find_package(spdlog QUIET)")
-    endif ()
-    # append backend deps
-    mmdeploy_add_deps(trt BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS TENSORRT CUDNN)
-    mmdeploy_add_deps(ort BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS ONNXRUNTIME)
-    mmdeploy_add_deps(ncnn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS ncnn)
-    mmdeploy_add_deps(openvino BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS InferenceEngine)
-    if (NOT MMDEPLOY_SHARED_LIBS)
-        mmdeploy_add_deps(pplnn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS pplnn)
-    endif ()
-    mmdeploy_add_deps(snpe BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS snpe)
-    mmdeploy_add_deps(rknn BACKENDS ${MMDEPLOY_TARGET_BACKENDS} DEPS rknn)
-
-    include(CMakePackageConfigHelpers)
-    # generate the config file that is includes the exports
-    configure_package_config_file(${CMAKE_SOURCE_DIR}/cmake/MMDeployConfig.cmake.in
-            "${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfig.cmake"
-            INSTALL_DESTINATION "lib/cmake"
-            NO_SET_AND_CHECK_MACRO
-            NO_CHECK_REQUIRED_COMPONENTS_MACRO
-            )
-
-    write_basic_package_version_file(
-            "${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfigVersion.cmake"
-            VERSION "${MMDeploy_VERSION_MAJOR}.${MMDeploy_VERSION_MINOR}"
-            COMPATIBILITY AnyNewerVersion
-            )
-
-    install(FILES
-            ${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfig.cmake
-            ${CMAKE_CURRENT_BINARY_DIR}/MMDeployConfigVersion.cmake
-            ${CMAKE_CURRENT_SOURCE_DIR}/cmake/MMDeploy.cmake
-            DESTINATION lib/cmake/MMDeploy
-            )
-
-    if (MSVC)
-        install(FILES
-                ${CMAKE_CURRENT_SOURCE_DIR}/cmake/loader.cpp.in
-                DESTINATION lib/cmake/MMDeploy
-                )
-    endif ()
-
-    install(DIRECTORY
-            ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules
-            DESTINATION lib/cmake/MMDeploy
-            )
-
-    if (${CMAKE_VERSION} VERSION_LESS "3.17.0")
-        install(SCRIPT cmake/post-install.cmake)
-    endif ()
-endif ()
+
+  if(${CMAKE_VERSION} VERSION_LESS "3.17.0")
+    install(SCRIPT cmake/post-install.cmake)
+  endif()
+endif()
diff --git a/cmake/MMDeploy.cmake b/cmake/MMDeploy.cmake
index 304c7b1bc1..30e15c4c7c 100644
--- a/cmake/MMDeploy.cmake
+++ b/cmake/MMDeploy.cmake
@@ -1,220 +1,228 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-function (mmdeploy_export_impl NAME)
-    set(_LIB_DIR lib)
-    if (MSVC)
-        set(_LIB_DIR bin)
-    endif ()
-    install(TARGETS ${NAME}
-            EXPORT MMDeployTargets
-            ARCHIVE DESTINATION lib
-            LIBRARY DESTINATION ${_LIB_DIR}
-            RUNTIME DESTINATION bin)
-endfunction ()
-
-function (mmdeploy_add_rpath NAME)
-    if (MSVC)
-        return()
-    elseif(APPLE)
-        set_target_properties(${NAME} PROPERTIES
-                INSTALL_RPATH "@loader_path"
-                BUILD_RPATH "@loader_path")
-    else ()
-        set_target_properties(${NAME} PROPERTIES
-                INSTALL_RPATH "\$ORIGIN"
-                BUILD_RPATH "\$ORIGIN")
-        target_link_libraries(${NAME} PRIVATE -Wl,--disable-new-dtags)
-    endif ()
-endfunction ()
+function(mmdeploy_export_impl NAME)
+  set(_LIB_DIR lib)
+  if(MSVC)
+    set(_LIB_DIR bin)
+  endif()
+  install(
+    TARGETS ${NAME}
+    EXPORT MMDeployTargets
+    ARCHIVE DESTINATION lib
+    LIBRARY DESTINATION ${_LIB_DIR}
+    RUNTIME DESTINATION bin)
+endfunction()
+
+function(mmdeploy_add_rpath NAME)
+  if(MSVC)
+    return()
+  elseif(APPLE)
+    set_target_properties(${NAME} PROPERTIES INSTALL_RPATH "@loader_path"
+                                             BUILD_RPATH "@loader_path")
+  else()
+    set_target_properties(${NAME} PROPERTIES INSTALL_RPATH "\$ORIGIN"
+                                             BUILD_RPATH "\$ORIGIN")
+    target_link_libraries(${NAME} PRIVATE -Wl,--disable-new-dtags)
+  endif()
+endfunction()
 
 macro(mmdeploy_add_net NAME)
-    if (MMDEPLOY_DYNAMIC_BACKEND)
-        mmdeploy_add_library(${NAME} SHARED ${ARGN})
-        mmdeploy_add_rpath(${NAME})
-        # DYNAMIC_BACKEND implies BUILD_SDK_MONOLITHIC
-        mmdeploy_export_impl(${NAME})
-        target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy)
-        set(BACKEND_LIB_NAMES ${BACKEND_LIB_NAMES} ${PROJECT_NAME} PARENT_SCOPE)
-    else ()
-        mmdeploy_add_module(${NAME} ${ARGN})
-    endif ()
+  if(MMDEPLOY_DYNAMIC_BACKEND)
+    mmdeploy_add_library(${NAME} SHARED ${ARGN})
+    mmdeploy_add_rpath(${NAME})
+    # DYNAMIC_BACKEND implies BUILD_SDK_MONOLITHIC
+    mmdeploy_export_impl(${NAME})
+    target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy)
+    set(BACKEND_LIB_NAMES
+        ${BACKEND_LIB_NAMES} ${PROJECT_NAME}
+        PARENT_SCOPE)
+  else()
+    mmdeploy_add_module(${NAME} ${ARGN})
+  endif()
 endmacro()
 
-function (mmdeploy_export NAME)
-    if (NOT MMDEPLOY_BUILD_SDK_MONOLITHIC)
-        mmdeploy_export_impl(${NAME})
-    endif ()
-endfunction ()
-
-
-function (mmdeploy_add_library NAME)
-    # EXCLUDE: exclude from registering & exporting
-    cmake_parse_arguments(_MMDEPLOY "EXCLUDE" "" "" ${ARGN})
-    # search for add_library keywords
-    cmake_parse_arguments(_TYPE "STATIC;SHARED;MODULE" "" "" ${_MMDEPLOY_UNPARSED_ARGUMENTS})
-    set(_MAYBE_TYPE)
-    if (NOT (_TYPE_STATIC OR _TYPE_SHARED OR _TYPE_MODULE))
-        set(_MAYBE_TYPE ${MMDEPLOY_LIB_TYPE})
-    endif ()
-    add_library(${NAME} ${_MAYBE_TYPE} ${_MMDEPLOY_UNPARSED_ARGUMENTS})
-    if (NOT MSVC)
-        target_compile_options(${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
-    endif ()
+function(mmdeploy_export NAME)
+  if(NOT MMDEPLOY_BUILD_SDK_MONOLITHIC)
+    mmdeploy_export_impl(${NAME})
+  endif()
+endfunction()
+
+function(mmdeploy_add_library NAME)
+  # EXCLUDE: exclude from registering & exporting
+  cmake_parse_arguments(_MMDEPLOY "EXCLUDE" "" "" ${ARGN})
+  # search for add_library keywords
+  cmake_parse_arguments(_TYPE "STATIC;SHARED;MODULE" "" ""
+                        ${_MMDEPLOY_UNPARSED_ARGUMENTS})
+  set(_MAYBE_TYPE)
+  if(NOT
+     (_TYPE_STATIC
+      OR _TYPE_SHARED
+      OR _TYPE_MODULE))
+    set(_MAYBE_TYPE ${MMDEPLOY_LIB_TYPE})
+  endif()
+  add_library(${NAME} ${_MAYBE_TYPE} ${_MMDEPLOY_UNPARSED_ARGUMENTS})
+  if(NOT MSVC)
+    target_compile_options(
+      ${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
+  endif()
+  target_compile_definitions(${NAME} PRIVATE -DMMDEPLOY_API_EXPORTS=1)
+  get_target_property(_TYPE ${NAME} TYPE)
+  if(_TYPE STREQUAL STATIC_LIBRARY)
+    set_target_properties(${NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
+  elseif(_TYPE STREQUAL SHARED_LIBRARY)
+
+  else()
+    message(FATAL_ERROR "unsupported type: ${_TYPE}")
+  endif()
+  if(NOT _MMDEPLOY_EXCLUDE)
+    target_link_libraries(MMDeployLibs INTERFACE ${NAME})
+    mmdeploy_export(${NAME})
+  endif()
+endfunction()
+
+function(mmdeploy_add_module NAME)
+  # EXCLUDE: exclude from registering & exporting as SDK module LIBRARY: the
+  # module is also a library (add_library with SHARED instead of MODULE)
+  cmake_parse_arguments(_MMDEPLOY "EXCLUDE;LIBRARY" "" "" ${ARGN})
+  # search for add_library keywords
+  cmake_parse_arguments(_TYPE "STATIC;SHARED;MODULE" "" ""
+                        ${_MMDEPLOY_UNPARSED_ARGUMENTS})
+
+  set(_MAYBE_TYPE)
+  # no library type specified
+  if(NOT
+     (_TYPE_STATIC
+      OR _TYPE_SHARED
+      OR _TYPE_MODULE))
+    # shared but not marked as a library, build module library so that no .lib
+    # dependency will be generated for MSVC
+    if(MSVC
+       AND MMDEPLOY_SHARED_LIBS
+       AND NOT _MMDEPLOY_LIBRARY)
+      set(_MAYBE_TYPE MODULE)
+    else()
+      set(_MAYBE_TYPE ${MMDEPLOY_LIB_TYPE})
+    endif()
+  endif()
+
+  add_library(${NAME} ${_MAYBE_TYPE} ${_MMDEPLOY_UNPARSED_ARGUMENTS})
+
+  if(NOT MSVC)
+    target_compile_options(
+      ${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
+  endif()
+
+  # automatically link mmdeploy::core if exists
+  if(TARGET mmdeploy::core)
+    target_link_libraries(${NAME} PRIVATE mmdeploy::core)
+  endif()
+
+  # export public symbols when marked as a library
+  if(_MMDEPLOY_LIBRARY)
     target_compile_definitions(${NAME} PRIVATE -DMMDEPLOY_API_EXPORTS=1)
-    get_target_property(_TYPE ${NAME} TYPE)
-    if (_TYPE STREQUAL STATIC_LIBRARY)
-        set_target_properties(${NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
-    elseif (_TYPE STREQUAL SHARED_LIBRARY)
-    else ()
-        message(FATAL_ERROR "unsupported type: ${_TYPE}")
-    endif ()
-    if (NOT _MMDEPLOY_EXCLUDE)
-        target_link_libraries(MMDeployLibs INTERFACE ${NAME})
-        mmdeploy_export(${NAME})
-    endif ()
-endfunction ()
-
-
-function (mmdeploy_add_module NAME)
-    # EXCLUDE: exclude from registering & exporting as SDK module
-    # LIBRARY: the module is also a library (add_libray with SHARED instead of MODULE)
-    cmake_parse_arguments(_MMDEPLOY "EXCLUDE;LIBRARY" "" "" ${ARGN})
-    # search for add_library keywords
-    cmake_parse_arguments(_TYPE "STATIC;SHARED;MODULE" "" "" ${_MMDEPLOY_UNPARSED_ARGUMENTS})
-
-    set(_MAYBE_TYPE)
-    # no library type specified
-    if (NOT (_TYPE_STATIC OR _TYPE_SHARED OR _TYPE_MODULE))
-        # shared but not marked as a library, build module library so that no .lib dependency
-        # will be generated for MSVC
-        if (MSVC AND MMDEPLOY_SHARED_LIBS AND NOT _MMDEPLOY_LIBRARY)
-            set(_MAYBE_TYPE MODULE)
-        else ()
-            set(_MAYBE_TYPE ${MMDEPLOY_LIB_TYPE})
-        endif ()
-    endif ()
-
-    add_library(${NAME} ${_MAYBE_TYPE} ${_MMDEPLOY_UNPARSED_ARGUMENTS})
-
-    if (NOT MSVC)
-        target_compile_options(${NAME} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
-    endif ()
-
-    # automatically link mmdeploy::core if exists
-    if (TARGET mmdeploy::core)
-        target_link_libraries(${NAME} PRIVATE mmdeploy::core)
-    endif ()
-
-    # export public symbols when marked as a library
-    if (_MMDEPLOY_LIBRARY)
-        target_compile_definitions(${NAME} PRIVATE -DMMDEPLOY_API_EXPORTS=1)
-    endif ()
-
-    get_target_property(_TYPE ${NAME} TYPE)
-    if (_TYPE STREQUAL STATIC_LIBRARY)
-        set_target_properties(${NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
-        if (MSVC)
-            target_link_options(${NAME} INTERFACE "/WHOLEARCHIVE:${NAME}")
-        endif ()
-        # register static modules
-        if (NOT _MMDEPLOY_EXCLUDE)
-            target_link_libraries(MMDeployStaticModules INTERFACE ${NAME})
-        endif ()
-    elseif (_TYPE STREQUAL SHARED_LIBRARY OR _TYPE STREQUAL MODULE_LIBRARY)
-        # register dynamic modules
-        if (NOT _MMDEPLOY_EXCLUDE)
-            target_link_libraries(MMDeployDynamicModules INTERFACE ${NAME})
-        endif ()
-    else ()
-        message(FATAL_ERROR "unsupported type: ${_TYPE}")
-    endif ()
-    if (NOT _MMDEPLOY_EXCLUDE)
-        mmdeploy_export(${NAME})
-    endif ()
-endfunction ()
-
-
-function (_mmdeploy_flatten_modules RETVAL)
-    set(_RETVAL)
-    foreach (ARG IN LISTS ARGN)
-        get_target_property(TYPE ${ARG} TYPE)
-        if (TYPE STREQUAL "INTERFACE_LIBRARY")
-            get_target_property(LIBS ${ARG} INTERFACE_LINK_LIBRARIES)
-            if (LIBS)
-                # pattern for 3.17+
-                list(FILTER LIBS EXCLUDE REGEX "^::@")
-                # pattern for 3.13-3.16
-                list(TRANSFORM LIBS REPLACE "(.+)::@.*" "\\1")
-                list(APPEND _RETVAL ${LIBS})
-            endif ()
-        else ()
-            list(APPEND _RETVAL ${ARG})
-        endif ()
-    endforeach ()
-    set(${RETVAL} ${_RETVAL} PARENT_SCOPE)
-endfunction ()
-
-
-function (mmdeploy_load_static NAME)
-    if (MSVC)
-        target_link_libraries(${NAME} PRIVATE ${ARGN})
-    else ()
-        _mmdeploy_flatten_modules(_MODULE_LIST ${ARGN})
-        if (APPLE)
-            foreach (module IN LISTS _MODULE_LIST)
-                target_link_libraries(${NAME} PRIVATE -force_load ${module})
-            endforeach ()
-        else ()
-            target_link_libraries(${NAME} PRIVATE
-                    -Wl,--whole-archive
-                    ${_MODULE_LIST}
-                    -Wl,--no-whole-archive)
-        endif ()
-    endif ()
-endfunction ()
-
-function (mmdeploy_load_dynamic NAME)
+  endif()
+
+  get_target_property(_TYPE ${NAME} TYPE)
+  if(_TYPE STREQUAL STATIC_LIBRARY)
+    set_target_properties(${NAME} PROPERTIES POSITION_INDEPENDENT_CODE 1)
+    if(MSVC)
+      target_link_options(${NAME} INTERFACE "/WHOLEARCHIVE:${NAME}")
+    endif()
+    # register static modules
+    if(NOT _MMDEPLOY_EXCLUDE)
+      target_link_libraries(MMDeployStaticModules INTERFACE ${NAME})
+    endif()
+  elseif(_TYPE STREQUAL SHARED_LIBRARY OR _TYPE STREQUAL MODULE_LIBRARY)
+    # register dynamic modules
+    if(NOT _MMDEPLOY_EXCLUDE)
+      target_link_libraries(MMDeployDynamicModules INTERFACE ${NAME})
+    endif()
+  else()
+    message(FATAL_ERROR "unsupported type: ${_TYPE}")
+  endif()
+  if(NOT _MMDEPLOY_EXCLUDE)
+    mmdeploy_export(${NAME})
+  endif()
+endfunction()
+
+function(_mmdeploy_flatten_modules RETVAL)
+  set(_RETVAL)
+  foreach(ARG IN LISTS ARGN)
+    get_target_property(TYPE ${ARG} TYPE)
+    if(TYPE STREQUAL "INTERFACE_LIBRARY")
+      get_target_property(LIBS ${ARG} INTERFACE_LINK_LIBRARIES)
+      if(LIBS)
+        # pattern for 3.17+
+        list(FILTER LIBS EXCLUDE REGEX "^::@")
+        # pattern for 3.13-3.16
+        list(TRANSFORM LIBS REPLACE "(.+)::@.*" "\\1")
+        list(APPEND _RETVAL ${LIBS})
+      endif()
+    else()
+      list(APPEND _RETVAL ${ARG})
+    endif()
+  endforeach()
+  set(${RETVAL}
+      ${_RETVAL}
+      PARENT_SCOPE)
+endfunction()
+
+function(mmdeploy_load_static NAME)
+  if(MSVC)
+    target_link_libraries(${NAME} PRIVATE ${ARGN})
+  else()
     _mmdeploy_flatten_modules(_MODULE_LIST ${ARGN})
-    if (MSVC)
-        if (NOT _MODULE_LIST)
-            return ()
-        endif ()
-        # MSVC has nothing like "-Wl,--no-as-needed ... -Wl,--as-needed", as a
-        # workaround we build a static module which loads the dynamic modules
-        set(_MODULE_STR ${_MODULE_LIST})
-        list(TRANSFORM _MODULE_STR REPLACE "(.+)" "\"\\1\"")
-        string(JOIN ",\n  " _MODULE_STR ${_MODULE_STR})
-        set(_MMDEPLOY_DYNAMIC_MODULES ${_MODULE_STR})
-
-        set(_LOADER_NAME ${NAME}_loader)
-
-        add_dependencies(${NAME} ${_MODULE_LIST})
-
-        set(_LOADER_PATH ${CMAKE_BINARY_DIR}/${_LOADER_NAME}.cpp)
-        # ! CMAKE_CURRENT_FUNCTION_LIST_DIR requires cmake 3.17+
-        configure_file(
-                ${CMAKE_CURRENT_FUNCTION_LIST_DIR}/loader.cpp.in
-                ${_LOADER_PATH})
-
-        mmdeploy_add_module(${_LOADER_NAME} STATIC EXCLUDE ${_LOADER_PATH})
-        mmdeploy_load_static(${NAME} ${_LOADER_NAME})
-    elseif (APPLE)
-        target_link_libraries(${NAME} PRIVATE ${_MODULE_LIST})
-    else ()
-        target_link_libraries(${NAME} PRIVATE
-                -Wl,--no-as-needed
-                ${_MODULE_LIST}
-                -Wl,--as-needed)
-    endif ()
-endfunction ()
+    if(APPLE)
+      foreach(module IN LISTS _MODULE_LIST)
+        target_link_libraries(${NAME} PRIVATE -force_load ${module})
+      endforeach()
+    else()
+      target_link_libraries(${NAME} PRIVATE -Wl,--whole-archive ${_MODULE_LIST}
+                                    -Wl,--no-whole-archive)
+    endif()
+  endif()
+endfunction()
+
+function(mmdeploy_load_dynamic NAME)
+  _mmdeploy_flatten_modules(_MODULE_LIST ${ARGN})
+  if(MSVC)
+    if(NOT _MODULE_LIST)
+      return()
+    endif()
+    # MSVC has nothing like "-Wl,--no-as-needed ... -Wl,--as-needed", as a
+    # workaround we build a static module which loads the dynamic modules
+    set(_MODULE_STR ${_MODULE_LIST})
+    list(TRANSFORM _MODULE_STR REPLACE "(.+)" "\"\\1\"")
+    string(JOIN ",\n  " _MODULE_STR ${_MODULE_STR})
+    set(_MMDEPLOY_DYNAMIC_MODULES ${_MODULE_STR})
+
+    set(_LOADER_NAME ${NAME}_loader)
+
+    add_dependencies(${NAME} ${_MODULE_LIST})
+
+    set(_LOADER_PATH ${CMAKE_BINARY_DIR}/${_LOADER_NAME}.cpp)
+    # ! CMAKE_CURRENT_FUNCTION_LIST_DIR requires cmake 3.17+
+    configure_file(${CMAKE_CURRENT_FUNCTION_LIST_DIR}/loader.cpp.in
+                   ${_LOADER_PATH})
+
+    mmdeploy_add_module(${_LOADER_NAME} STATIC EXCLUDE ${_LOADER_PATH})
+    mmdeploy_load_static(${NAME} ${_LOADER_NAME})
+  elseif(APPLE)
+    target_link_libraries(${NAME} PRIVATE ${_MODULE_LIST})
+  else()
+    target_link_libraries(${NAME} PRIVATE -Wl,--no-as-needed ${_MODULE_LIST}
+                                  -Wl,--as-needed)
+  endif()
+endfunction()
 
 macro(mmdeploy_add_deps backend)
-    set(multiValueArgs BACKENDS DEPS)
-    cmake_parse_arguments(INFO "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    set(has_backend OFF)
-    if (${backend} IN_LIST INFO_BACKENDS)
-        foreach(pkg IN LISTS INFO_DEPS)
-            set(${pkg}_DEPENDENCY "find_package(${pkg} REQUIRED)")
-        endforeach()
-    endif()
+  set(multiValueArgs BACKENDS DEPS)
+  cmake_parse_arguments(INFO "${options}" "${oneValueArgs}" "${multiValueArgs}"
+                        ${ARGN})
+  set(has_backend OFF)
+  if(${backend} IN_LIST INFO_BACKENDS)
+    foreach(pkg IN LISTS INFO_DEPS)
+      set(${pkg}_DEPENDENCY "find_package(${pkg} REQUIRED)")
+    endforeach()
+  endif()
 endmacro()
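Note on the helpers reformatted above: SDK modules register themselves through static initializers, so mmdeploy_load_static wraps static modules in --whole-archive (or -force_load on Apple, /WHOLEARCHIVE on MSVC) to keep the linker from discarding them, while mmdeploy_load_dynamic uses --no-as-needed (or a generated loader stub on MSVC). A sketch of the intended call pattern; the module and tool names below are made up for illustration:

```cmake
# Hypothetical usage of the helpers defined in cmake/MMDeploy.cmake.
mmdeploy_add_module(mmdeploy_custom_op STATIC custom_op.cpp)

add_executable(image_demo main.cpp)
# Force-link every registered static module so its registrars run at startup.
mmdeploy_load_static(image_demo MMDeployStaticModules)
# Keep dynamic modules as runtime dependencies even though no symbol is referenced.
mmdeploy_load_dynamic(image_demo MMDeployDynamicModules)
```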
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 578fdc7e74..7b2e1c7d83 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -1,110 +1,114 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-if (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
-    # suppress 'CMAKE_CUDA_ARCHITECTURES' warning
-    cmake_policy(SET CMP0104 OLD)
-endif ()
+if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
+  # suppress 'CMAKE_CUDA_ARCHITECTURES' warning
+  cmake_policy(SET CMP0104 OLD)
+endif()
 
-if (MSVC OR (NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY))
-    # use shared, on windows, python api can't build with static lib.
-    set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
-    set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
-endif ()
+if(MSVC OR (NOT DEFINED CMAKE_CUDA_RUNTIME_LIBRARY))
+  # use the shared runtime; on windows, the python api can't be built with the static lib
+  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+  set(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
+endif()
 
-if (MSVC)
-    # no plugin in BuildCustomizations and no specify cuda toolset
-    if (NOT CMAKE_VS_PLATFORM_TOOLSET_CUDA)
-        message(FATAL_ERROR "Please install CUDA MSBuildExtensions")
-    endif ()
+if(MSVC)
+  # no plugin in BuildCustomizations and no cuda toolset specified
+  if(NOT CMAKE_VS_PLATFORM_TOOLSET_CUDA)
+    message(FATAL_ERROR "Please install CUDA MSBuildExtensions")
+  endif()
 
-    if (CMAKE_VS_PLATFORM_TOOLSET_CUDA_CUSTOM_DIR)
-        # find_package(CUDA) required ENV{CUDA_PATH}
-        set(ENV{CUDA_PATH} ${CMAKE_VS_PLATFORM_TOOLSET_CUDA_CUSTOM_DIR})
-    else ()
-        # we use CUDA_PATH and ignore nvcc.exe
-        # cmake will import highest cuda props version, which may not equal to CUDA_PATH
-        if (NOT (DEFINED ENV{CUDA_PATH}))
-            message(FATAL_ERROR "Please set CUDA_PATH environment variable")
-        endif ()
+  if(CMAKE_VS_PLATFORM_TOOLSET_CUDA_CUSTOM_DIR)
+    # find_package(CUDA) requires ENV{CUDA_PATH}
+    set(ENV{CUDA_PATH} ${CMAKE_VS_PLATFORM_TOOLSET_CUDA_CUSTOM_DIR})
+  else()
+    # we use CUDA_PATH and ignore nvcc.exe; cmake will import the highest cuda
+    # props version, which may not equal CUDA_PATH
+    if(NOT (DEFINED ENV{CUDA_PATH}))
+      message(FATAL_ERROR "Please set CUDA_PATH environment variable")
+    endif()
 
-        string(REGEX REPLACE ".*v([0-9]+)\\..*" "\\1" _MAJOR $ENV{CUDA_PATH})
-        string(REGEX REPLACE ".*v[0-9]+\\.([0-9]+).*" "\\1" _MINOR $ENV{CUDA_PATH})
-        if (NOT (${CMAKE_VS_PLATFORM_TOOLSET_CUDA} STREQUAL "${_MAJOR}.${_MINOR}"))
-            message(FATAL_ERROR "Auto detected cuda version ${CMAKE_VS_PLATFORM_TOOLSET_CUDA}"
-                    " is mismatch with ENV{CUDA_PATH} $ENV{CUDA_PATH}. Please modify CUDA_PATH"
-                    " to match ${CMAKE_VS_PLATFORM_TOOLSET_CUDA} or specify cuda toolset by"
-                    " cmake -T cuda=/path/to/cuda ..")
-        endif ()
+    string(REGEX REPLACE ".*v([0-9]+)\\..*" "\\1" _MAJOR $ENV{CUDA_PATH})
+    string(REGEX REPLACE ".*v[0-9]+\\.([0-9]+).*" "\\1" _MINOR $ENV{CUDA_PATH})
+    if(NOT (${CMAKE_VS_PLATFORM_TOOLSET_CUDA} STREQUAL "${_MAJOR}.${_MINOR}"))
+      message(
+        FATAL_ERROR
+          "Auto detected cuda version ${CMAKE_VS_PLATFORM_TOOLSET_CUDA}"
+          " does not match ENV{CUDA_PATH} $ENV{CUDA_PATH}. Please modify CUDA_PATH"
+          " to match ${CMAKE_VS_PLATFORM_TOOLSET_CUDA} or specify cuda toolset by"
+          " cmake -T cuda=/path/to/cuda ..")
+    endif()
 
-        if (NOT (DEFINED ENV{CUDA_PATH_V${_MAJOR}_${_MINOR}}))
-            message(FATAL_ERROR "Please set CUDA_PATH_V${_MAJOR}_${_MINOR} environment variable")
-        endif ()
-    endif ()
-endif ()
+    if(NOT (DEFINED ENV{CUDA_PATH_V${_MAJOR}_${_MINOR}}))
+      message(
+        FATAL_ERROR
+          "Please set CUDA_PATH_V${_MAJOR}_${_MINOR} environment variable")
+    endif()
+  endif()
+endif()
 
 # nvcc compiler settings
 find_package(CUDA REQUIRED)
 
-if (MSVC)
-    set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc.exe)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/wd4819,/wd4828")
-    if (HAVE_CXX_FLAG_UTF_8)
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/utf-8")
-    endif ()
-else ()
-    set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
-    # Explicitly set the cuda host compiler. Because the default host compiler #
-    # selected by cmake maybe wrong.
-    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-    set(CMAKE_CUDA_FLAGS
-            "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC,-Wall,-fvisibility=hidden")
-    if (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-gnu-unique")
-    endif ()
-endif ()
+if(MSVC)
+  set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc.exe)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/wd4819,/wd4828")
+  if(HAVE_CXX_FLAG_UTF_8)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/utf-8")
+  endif()
+else()
+  set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
+  # Explicitly set the cuda host compiler, because the default host compiler
+  # selected by cmake may be wrong.
+  set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+  set(CMAKE_CUDA_FLAGS
+      "${CMAKE_CUDA_FLAGS} -Xcompiler=-fPIC,-Wall,-fvisibility=hidden")
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=-fno-gnu-unique")
+  endif()
+endif()
 
 enable_language(CUDA)
 
 # set virtual compute architecture and real ones
 set(_NVCC_FLAGS)
-if (NOT CMAKE_CUDA_ARCHITECTURES)
-    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
-    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_53,code=sm_53")
-    if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "8")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
-    endif ()
-    if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "9")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
-    endif ()
-    if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "10")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_72,code=sm_72")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
-    endif ()
-    if (CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "11")
-        set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80")
-        if (CUDA_VERSION_MINOR VERSION_GREATER_EQUAL "1")
-            # cuda doesn't support `sm_86` until version 11.1
-            set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
-        endif ()
-        if (CUDA_VERSION_MINOR VERSION_GREATER_EQUAL "4")
-            set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_87,code=sm_87")
-        endif ()
-    endif ()
-endif ()
+if(NOT CMAKE_CUDA_ARCHITECTURES)
+  set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52")
+  set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_53,code=sm_53")
+  if(CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "8")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
+  endif()
+  if(CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "9")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
+  endif()
+  if(CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "10")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_72,code=sm_72")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_75,code=sm_75")
+  endif()
+  if(CUDA_VERSION_MAJOR VERSION_GREATER_EQUAL "11")
+    set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_80,code=sm_80")
+    if(CUDA_VERSION_MINOR VERSION_GREATER_EQUAL "1")
+      # cuda doesn't support `sm_86` until version 11.1
+      set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_86,code=sm_86")
+    endif()
+    if(CUDA_VERSION_MINOR VERSION_GREATER_EQUAL "4")
+      set(_NVCC_FLAGS "${_NVCC_FLAGS} -gencode arch=compute_87,code=sm_87")
+    endif()
+  endif()
+endif()
 
 set(CMAKE_CUDA_FLAGS_DEBUG "-g -O0")
 set(CMAKE_CUDA_FLAGS_RELEASE "-O3")
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMMDEPLOY_USE_CUDA=1")
 
-if (NOT MSVC)
-    set(CMAKE_CUDA_STANDARD 14)
-endif ()
+if(NOT MSVC)
+  set(CMAKE_CUDA_STANDARD 14)
+endif()
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${_NVCC_FLAGS}")
 
-if (MSVC AND MMDEPLOY_USE_MSCV_STATIC)
-    string(REPLACE -MD -MT CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG}")
-    string(REPLACE -MD -MT CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}")
-endif ()
+if(MSVC AND MMDEPLOY_USE_MSCV_STATIC)
+  string(REPLACE -MD -MT CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG}")
+  string(REPLACE -MD -MT CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE}")
+endif()
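Note: the -gencode list above is only a fallback; it is built solely when CMAKE_CUDA_ARCHITECTURES is unset, and every listed architecture is compiled into one fat binary. A sketch of narrowing the build instead (the values are examples, not project defaults; whether the pinned value then drives nvcc flags depends on the CMP0104 policy set at the top of this file):

```cmake
# Sketch: pin CUDA architectures before cmake/cuda.cmake is included,
# e.g. from the command line: cmake -DCMAKE_CUDA_ARCHITECTURES="75;86" ..
# Setting it skips the -gencode fallback loop above entirely.
set(CMAKE_CUDA_ARCHITECTURES 75 86 CACHE STRING "target CUDA architectures")
```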
diff --git a/cmake/filesystem.cmake b/cmake/filesystem.cmake
index 787923f2cc..14f1aaaadf 100644
--- a/cmake/filesystem.cmake
+++ b/cmake/filesystem.cmake
@@ -1,43 +1,48 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-# Modified from https://github.com/pybind/pybind11/blob/master/tests/CMakeLists.txt
+# Copyright (c) OpenMMLab. All rights reserved. Modified from
+# https://github.com/pybind/pybind11/blob/master/tests/CMakeLists.txt
 
-if (MSVC)
-    set(STD_FS_NO_LIB_NEEDED TRUE)
-else ()
-    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-            "#include <filesystem>\nint main(int,char**argv){return std::filesystem::path(argv[0]).string().length();}")
-    try_compile(HAS_INC_FS ${CMAKE_CURRENT_BINARY_DIR}
-            SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-            COMPILE_DEFINITIONS -std=c++17 -c)
+if(MSVC)
+  set(STD_FS_NO_LIB_NEEDED TRUE)
+else()
+  file(
+    WRITE ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+    "#include <filesystem>\nint main(int,char**argv){return std::filesystem::path(argv[0]).string().length();}"
+  )
+  try_compile(
+    HAS_INC_FS ${CMAKE_CURRENT_BINARY_DIR}
+    SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+    COMPILE_DEFINITIONS -std=c++17 -c)
 
-    if (NOT HAS_INC_FS)
-        file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-                "#include <experimental/filesystem>\nint main(int,char**argv){return std::experimental::filesystem::path(argv[0]).string().length();}")
-    endif ()
+  if(NOT HAS_INC_FS)
+    file(
+      WRITE ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+      "#include <experimental/filesystem>\nint main(int,char**argv){return std::experimental::filesystem::path(argv[0]).string().length();}"
+    )
+  endif()
 
-    try_compile(
-        STD_FS_NO_LIB_NEEDED ${CMAKE_CURRENT_BINARY_DIR}
-        SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-        COMPILE_DEFINITIONS -std=c++17)
-    try_compile(
-        STD_FS_NEEDS_STDCXXFS ${CMAKE_CURRENT_BINARY_DIR}
-        SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-        COMPILE_DEFINITIONS -std=c++17
-        LINK_LIBRARIES stdc++fs)
-    try_compile(
-        STD_FS_NEEDS_CXXFS ${CMAKE_CURRENT_BINARY_DIR}
-        SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
-        COMPILE_DEFINITIONS -std=c++17
-        LINK_LIBRARIES c++fs)
-endif ()
+  try_compile(
+    STD_FS_NO_LIB_NEEDED ${CMAKE_CURRENT_BINARY_DIR}
+    SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+    COMPILE_DEFINITIONS -std=c++17)
+  try_compile(
+    STD_FS_NEEDS_STDCXXFS ${CMAKE_CURRENT_BINARY_DIR}
+    SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+    COMPILE_DEFINITIONS -std=c++17
+    LINK_LIBRARIES stdc++fs)
+  try_compile(
+    STD_FS_NEEDS_CXXFS ${CMAKE_CURRENT_BINARY_DIR}
+    SOURCES ${CMAKE_CURRENT_BINARY_DIR}/main.cpp
+    COMPILE_DEFINITIONS -std=c++17
+    LINK_LIBRARIES c++fs)
+endif()
 
-if (${STD_FS_NO_LIB_NEEDED})
-    set(STD_FS_LIB "")
-elseif (${STD_FS_NEEDS_STDCXXFS})
-    set(STD_FS_LIB stdc++fs)
-elseif (${STD_FS_NEEDS_CXXFS})
-    set(STD_FS_LIB c++fs)
-else ()
-    message(WARNING "Unknown C++17 compiler - not passing -lstdc++fs")
-    set(STD_FS_LIB "")
-endif ()
+if(${STD_FS_NO_LIB_NEEDED})
+  set(STD_FS_LIB "")
+elseif(${STD_FS_NEEDS_STDCXXFS})
+  set(STD_FS_LIB stdc++fs)
+elseif(${STD_FS_NEEDS_CXXFS})
+  set(STD_FS_LIB c++fs)
+else()
+  message(WARNING "Unknown C++17 compiler - not passing -lstdc++fs")
+  set(STD_FS_LIB "")
+endif()
diff --git a/cmake/modules/FindCUDNN.cmake b/cmake/modules/FindCUDNN.cmake
index 3f3f9b893a..332fad48eb 100644
--- a/cmake/modules/FindCUDNN.cmake
+++ b/cmake/modules/FindCUDNN.cmake
@@ -1,36 +1,39 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-if (NOT DEFINED CUDNN_DIR)
-    set(CUDNN_DIR $ENV{CUDNN_DIR})
-endif ()
+if(NOT DEFINED CUDNN_DIR)
+  set(CUDNN_DIR $ENV{CUDNN_DIR})
+endif()
 
 find_path(
-    CUDNN_INCLUDE_DIR cudnn.h
-    HINTS ${CUDNN_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
-    PATH_SUFFIXES include)
+  CUDNN_INCLUDE_DIR cudnn.h
+  HINTS ${CUDNN_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES include)
 
 find_library(
-    CUDNN_LIBRARY_CUDNN_PATH cudnn
-    HINTS ${CUDNN_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
-    PATH_SUFFIXES lib lib64 lib/x64)
+  CUDNN_LIBRARY_CUDNN_PATH cudnn
+  HINTS ${CUDNN_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
+  PATH_SUFFIXES lib lib64 lib/x64)
 
-if (NOT (CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY_CUDNN_PATH))
-    message(FATAL_ERROR "Couldn't find cuDNN in CUDNN_DIR: ${CUDNN_DIR}, "
-            "or in CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}, "
-            "please check if the path is correct.")
+if(NOT (CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY_CUDNN_PATH))
+  message(
+    FATAL_ERROR
+      "Couldn't find cuDNN in CUDNN_DIR: ${CUDNN_DIR}, "
+      "or in CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}, "
+      "please check if the path is correct.")
 endif()
 
 add_library(cudnn SHARED IMPORTED)
-set_property(TARGET cudnn APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
-if (MSVC)
-    set_target_properties(cudnn PROPERTIES
-            IMPORTED_IMPLIB_RELEASE ${CUDNN_LIBRARY_CUDNN_PATH}
-            INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}
-            )
+set_property(
+  TARGET cudnn
+  APPEND
+  PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+if(MSVC)
+  set_target_properties(
+    cudnn PROPERTIES IMPORTED_IMPLIB_RELEASE ${CUDNN_LIBRARY_CUDNN_PATH}
+                     INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR})
 else()
-    set_target_properties(cudnn PROPERTIES
-            IMPORTED_LOCATION_RELEASE ${CUDNN_LIBRARY_CUDNN_PATH}
-            INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR}
-            )
+  set_target_properties(
+    cudnn PROPERTIES IMPORTED_LOCATION_RELEASE ${CUDNN_LIBRARY_CUDNN_PATH}
+                     INTERFACE_INCLUDE_DIRECTORIES ${CUDNN_INCLUDE_DIR})
 endif()
diff --git a/cmake/modules/FindONNXRUNTIME.cmake b/cmake/modules/FindONNXRUNTIME.cmake
index 63ea176595..d3eff87f65 100644
--- a/cmake/modules/FindONNXRUNTIME.cmake
+++ b/cmake/modules/FindONNXRUNTIME.cmake
@@ -1,36 +1,40 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-if (NOT DEFINED ONNXRUNTIME_DIR) - set(ONNXRUNTIME_DIR $ENV{ONNXRUNTIME_DIR}) -endif () -if (NOT ONNXRUNTIME_DIR) - message(FATAL_ERROR "Please set ONNXRUNTIME_DIR with cmake -D option.") +if(NOT DEFINED ONNXRUNTIME_DIR) + set(ONNXRUNTIME_DIR $ENV{ONNXRUNTIME_DIR}) +endif() +if(NOT ONNXRUNTIME_DIR) + message(FATAL_ERROR "Please set ONNXRUNTIME_DIR with cmake -D option.") endif() find_path( - ONNXRUNTIME_INCLUDE_DIR onnxruntime_cxx_api.h - HINTS ${ONNXRUNTIME_DIR} - PATH_SUFFIXES include) + ONNXRUNTIME_INCLUDE_DIR onnxruntime_cxx_api.h + HINTS ${ONNXRUNTIME_DIR} + PATH_SUFFIXES include) find_library( - ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH onnxruntime - HINTS ${ONNXRUNTIME_DIR} - PATH_SUFFIXES lib lib64 lib/x64) -if (NOT (ONNXRUNTIME_INCLUDE_DIR AND ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH)) - message(FATAL_ERROR "Couldn't find onnxruntime in ONNXRUNTIME_DIR: " - "${ONNXRUNTIME_DIR}, please check if the path is correct.") + ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH onnxruntime + HINTS ${ONNXRUNTIME_DIR} + PATH_SUFFIXES lib lib64 lib/x64) +if(NOT (ONNXRUNTIME_INCLUDE_DIR AND ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH)) + message( + FATAL_ERROR "Couldn't find onnxruntime in ONNXRUNTIME_DIR: " + "${ONNXRUNTIME_DIR}, please check if the path is correct.") endif() add_library(onnxruntime SHARED IMPORTED) -set_property(TARGET onnxruntime APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) -if (MSVC) - set_target_properties(onnxruntime PROPERTIES - IMPORTED_IMPLIB_RELEASE ${ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${ONNXRUNTIME_INCLUDE_DIR} - ) +set_property( + TARGET onnxruntime + APPEND + PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +if(MSVC) + set_target_properties( + onnxruntime + PROPERTIES IMPORTED_IMPLIB_RELEASE ${ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${ONNXRUNTIME_INCLUDE_DIR}) else() - set_target_properties(onnxruntime PROPERTIES - IMPORTED_LOCATION_RELEASE ${ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${ONNXRUNTIME_INCLUDE_DIR} - ) + set_target_properties( + onnxruntime + PROPERTIES IMPORTED_LOCATION_RELEASE ${ONNXRUNTIME_LIBRARY_ONNXRUNTIME_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${ONNXRUNTIME_INCLUDE_DIR}) endif() diff --git a/cmake/modules/FindTENSORRT.cmake b/cmake/modules/FindTENSORRT.cmake index e2c328923e..25d015a52c 100644 --- a/cmake/modules/FindTENSORRT.cmake +++ b/cmake/modules/FindTENSORRT.cmake @@ -1,51 +1,56 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-if (NOT DEFINED TENSORRT_DIR) - set(TENSORRT_DIR $ENV{TENSORRT_DIR}) -endif () -if (NOT TENSORRT_DIR) - message(FATAL_ERROR "Please set TENSORRT_DIR with cmake -D option.") +if(NOT DEFINED TENSORRT_DIR) + set(TENSORRT_DIR $ENV{TENSORRT_DIR}) +endif() +if(NOT TENSORRT_DIR) + message(FATAL_ERROR "Please set TENSORRT_DIR with cmake -D option.") endif() find_path( - TENSORRT_INCLUDE_DIR NvInfer.h - HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES include) + TENSORRT_INCLUDE_DIR NvInfer.h + HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include) -if (NOT TENSORRT_INCLUDE_DIR) - message(FATAL_ERROR "Cannot find TensorRT header NvInfer.h " - "in TENSORRT_DIR: ${TENSORRT_DIR} or in CUDA_TOOLKIT_ROOT_DIR: " - "${CUDA_TOOLKIT_ROOT_DIR}, please check if the path is correct.") -endif () +if(NOT TENSORRT_INCLUDE_DIR) + message( + FATAL_ERROR + "Cannot find TensorRT header NvInfer.h " + "in TENSORRT_DIR: ${TENSORRT_DIR} or in CUDA_TOOLKIT_ROOT_DIR: " + "${CUDA_TOOLKIT_ROOT_DIR}, please check if the path is correct.") +endif() set(__TENSORRT_LIB_COMPONENTS nvinfer;nvinfer_plugin) foreach(__component ${__TENSORRT_LIB_COMPONENTS}) - find_library( - __component_path ${__component} - HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/x64) - if (NOT __component_path) - message(FATAL_ERROR "Cannot find TensorRT lib ${__component} in " - "TENSORRT_DIR: ${TENSORRT_DIR} or CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}, " - "please check if the path is correct") - endif() + find_library( + __component_path ${__component} + HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) + if(NOT __component_path) + message( + FATAL_ERROR + "Cannot find TensorRT lib ${__component} in " + "TENSORRT_DIR: ${TENSORRT_DIR} or CUDA_TOOLKIT_ROOT_DIR: ${CUDA_TOOLKIT_ROOT_DIR}, " + "please check if the path is correct") + endif() - add_library(${__component} SHARED IMPORTED) - set_property(TARGET ${__component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) - if (MSVC) - set_target_properties( - ${__component} PROPERTIES - IMPORTED_IMPLIB_RELEASE ${__component_path} - INTERFACE_INCLUDE_DIRECTORIES ${TENSORRT_INCLUDE_DIR} - ) - else() - set_target_properties( - ${__component} PROPERTIES - IMPORTED_LOCATION_RELEASE ${__component_path} - INTERFACE_INCLUDE_DIRECTORIES ${TENSORRT_INCLUDE_DIR} - ) - endif() - unset(__component_path CACHE) + add_library(${__component} SHARED IMPORTED) + set_property( + TARGET ${__component} + APPEND + PROPERTY IMPORTED_CONFIGURATIONS RELEASE) + if(MSVC) + set_target_properties( + ${__component} + PROPERTIES IMPORTED_IMPLIB_RELEASE ${__component_path} + INTERFACE_INCLUDE_DIRECTORIES ${TENSORRT_INCLUDE_DIR}) + else() + set_target_properties( + ${__component} + PROPERTIES IMPORTED_LOCATION_RELEASE ${__component_path} + INTERFACE_INCLUDE_DIRECTORIES ${TENSORRT_INCLUDE_DIR}) + endif() + unset(__component_path CACHE) endforeach() set(TENSORRT_LIBS ${__TENSORRT_LIB_COMPONENTS}) diff --git a/cmake/modules/FindTVM.cmake b/cmake/modules/FindTVM.cmake index f6443609e4..8ae3a48abd 100644 --- a/cmake/modules/FindTVM.cmake +++ b/cmake/modules/FindTVM.cmake @@ -1,47 +1,56 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-if (NOT DEFINED TVM_DIR) - set(TVM_DIR $ENV{TVM_DIR}) -endif () -if (NOT TVM_DIR) - message(FATAL_ERROR "Please set TVM_DIR with cmake -D option.") +if(NOT DEFINED TVM_DIR) + set(TVM_DIR $ENV{TVM_DIR}) +endif() +if(NOT TVM_DIR) + message(FATAL_ERROR "Please set TVM_DIR with cmake -D option.") endif() find_path( - TVM_INCLUDE_DIR tvm/runtime/c_runtime_api.h - HINTS ${TVM_DIR} - PATH_SUFFIXES include) + TVM_INCLUDE_DIR tvm/runtime/c_runtime_api.h + HINTS ${TVM_DIR} + PATH_SUFFIXES include) find_path( - DMLC_CORE_INCLUDE_DIR dmlc/io.h - HINTS ${TVM_DIR}/3rdparty/dmlc-core - PATH_SUFFIXES include) + DMLC_CORE_INCLUDE_DIR dmlc/io.h + HINTS ${TVM_DIR}/3rdparty/dmlc-core + PATH_SUFFIXES include) find_path( - DLPACK_INCLUDE_DIR dlpack/dlpack.h - HINTS ${TVM_DIR}/3rdparty/dlpack - PATH_SUFFIXES include) + DLPACK_INCLUDE_DIR dlpack/dlpack.h + HINTS ${TVM_DIR}/3rdparty/dlpack + PATH_SUFFIXES include) find_library( - TVM_LIBRARY_PATH tvm_runtime - HINTS ${TVM_DIR} - PATH_SUFFIXES build lib build/${CMAKE_BUILD_TYPE}) -if (NOT (TVM_INCLUDE_DIR AND DMLC_CORE_INCLUDE_DIR AND DLPACK_INCLUDE_DIR AND TVM_LIBRARY_PATH)) - message(FATAL_ERROR "Couldn't find tvm in TVM_DIR: " - "${TVM_DIR}, please check if the path is correct.") + TVM_LIBRARY_PATH tvm_runtime + HINTS ${TVM_DIR} + PATH_SUFFIXES build lib build/${CMAKE_BUILD_TYPE}) +if(NOT + (TVM_INCLUDE_DIR + AND DMLC_CORE_INCLUDE_DIR + AND DLPACK_INCLUDE_DIR + AND TVM_LIBRARY_PATH)) + message(FATAL_ERROR "Couldn't find tvm in TVM_DIR: " + "${TVM_DIR}, please check if the path is correct.") endif() add_library(tvm_runtime SHARED IMPORTED) -set_property(TARGET tvm_runtime APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) -if (MSVC) - set_target_properties(tvm_runtime PROPERTIES - IMPORTED_IMPLIB_RELEASE ${TVM_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${TVM_INCLUDE_DIR} ${DMLC_CORE_INCLUDE_DIR} ${DLPACK_INCLUDE_DIR} - ) +set_property( + TARGET tvm_runtime + APPEND + PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +if(MSVC) + set_target_properties( + tvm_runtime + PROPERTIES IMPORTED_IMPLIB_RELEASE ${TVM_LIBRARY_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${TVM_INCLUDE_DIR} + ${DMLC_CORE_INCLUDE_DIR} ${DLPACK_INCLUDE_DIR}) else() - set_target_properties(tvm_runtime PROPERTIES - IMPORTED_LOCATION_RELEASE ${TVM_LIBRARY_PATH} - INTERFACE_INCLUDE_DIRECTORIES ${TVM_INCLUDE_DIR} ${DMLC_CORE_INCLUDE_DIR} ${DLPACK_INCLUDE_DIR} - ) + set_target_properties( + tvm_runtime + PROPERTIES IMPORTED_LOCATION_RELEASE ${TVM_LIBRARY_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${TVM_INCLUDE_DIR} + ${DMLC_CORE_INCLUDE_DIR} ${DLPACK_INCLUDE_DIR}) endif() diff --git a/cmake/post-install.cmake b/cmake/post-install.cmake index d289e53996..c9ae0d6dd9 100644 --- a/cmake/post-install.cmake +++ b/cmake/post-install.cmake @@ -1,10 +1,10 @@ - -set(_TARGETS_PATH ${CMAKE_INSTALL_PREFIX}/lib/cmake/MMDeploy/MMDeployTargets.cmake) +set(_TARGETS_PATH + ${CMAKE_INSTALL_PREFIX}/lib/cmake/MMDeploy/MMDeployTargets.cmake) file(READ ${_TARGETS_PATH} _MMDEPLOY_TARGETS) -string(REGEX REPLACE "::@<0x[a-z0-9]+>" "" - _MMDEPLOY_TARGETS_FIXED "${_MMDEPLOY_TARGETS}") +string(REGEX REPLACE "::@<0x[a-z0-9]+>" "" _MMDEPLOY_TARGETS_FIXED + "${_MMDEPLOY_TARGETS}") file(WRITE ${_TARGETS_PATH} "${_MMDEPLOY_TARGETS_FIXED}") diff --git a/cmake/stacktrace.cmake b/cmake/stacktrace.cmake index bd0761a217..4ef719aaa2 100644 --- a/cmake/stacktrace.cmake +++ b/cmake/stacktrace.cmake @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
find_package(Boost 1.65 COMPONENTS stacktrace_backtrace) -if (Boost_FOUND) - target_link_libraries(mmdeploy_core PUBLIC Boost::stacktrace_backtrace) - target_compile_definitions(mmdeploy_core PUBLIC -DMMDEPLOY_STATUS_USE_STACKTRACE=1) +if(Boost_FOUND) + target_link_libraries(mmdeploy_core PUBLIC Boost::stacktrace_backtrace) + target_compile_definitions(mmdeploy_core + PUBLIC -DMMDEPLOY_STATUS_USE_STACKTRACE=1) endif() diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 546a85070d..1302f4bbcf 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -3,35 +3,33 @@ include(${CMAKE_SOURCE_DIR}/cmake/modules/FindTENSORRT.cmake) include(${CMAKE_SOURCE_DIR}/cmake/modules/FindCUDNN.cmake) find_path( - TENSORRT_INCLUDE_DIR NvInfer.h - HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES include) -if (TENSORRT_INCLUDE_DIR) - message(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}") -else () - message(ERROR "Cannot find TensorRT headers") -endif () + TENSORRT_INCLUDE_DIR NvInfer.h + HINTS ${TENSORRT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include) +if(TENSORRT_INCLUDE_DIR) + message(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}") +else() + message(ERROR "Cannot find TensorRT headers") +endif() find_library( - TENSORRT_LIBRARY_INFER nvinfer - HINTS ${TENSORRT_DIR} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/x64) + TENSORRT_LIBRARY_INFER nvinfer + HINTS ${TENSORRT_DIR} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) find_library( - TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin - HINTS ${TENSORRT_DIR} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} - PATH_SUFFIXES lib lib64 lib/x64) -set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} - ${TENSORRT_LIBRARY_INFER_PLUGIN}) -if (TENSORRT_LIBRARY_INFER - AND TENSORRT_LIBRARY_INFER_PLUGIN) - message(STATUS "Found TensorRT libs at ${TENSORRT_LIBRARY}") -else () - message(FATAL_ERROR "Cannot find TensorRT libs") -endif () + TENSORRT_LIBRARY_INFER_PLUGIN nvinfer_plugin + HINTS ${TENSORRT_DIR} ${TENSORRT_BUILD} ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 lib/x64) +set(TENSORRT_LIBRARY ${TENSORRT_LIBRARY_INFER} ${TENSORRT_LIBRARY_INFER_PLUGIN}) +if(TENSORRT_LIBRARY_INFER AND TENSORRT_LIBRARY_INFER_PLUGIN) + message(STATUS "Found TensorRT libs at ${TENSORRT_LIBRARY}") +else() + message(FATAL_ERROR "Cannot find TensorRT libs") +endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(TENSORRT DEFAULT_MSG TENSORRT_INCLUDE_DIR - TENSORRT_LIBRARY) -if (NOT TENSORRT_FOUND) - message(ERROR "Cannot find TensorRT library.") -endif () + TENSORRT_LIBRARY) +if(NOT TENSORRT_FOUND) + message(ERROR "Cannot find TensorRT library.") +endif() diff --git a/cmake/toolchains/aarch64-linux-gnu.cmake b/cmake/toolchains/aarch64-linux-gnu.cmake index f95911efd1..dfb3fced21 100644 --- a/cmake/toolchains/aarch64-linux-gnu.cmake +++ b/cmake/toolchains/aarch64-linux-gnu.cmake @@ -13,5 +13,9 @@ set(CMAKE_C_FLAGS "-march=armv8-a") set(CMAKE_CXX_FLAGS "-march=armv8-a") # cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") +set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS}" + CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS}" + CACHE STRING "c++ flags") diff --git a/cmake/toolchains/arm-linux-gnueabihf.cmake b/cmake/toolchains/arm-linux-gnueabihf.cmake index 74ed5bf935..d4cfe513a0 100644 --- a/cmake/toolchains/arm-linux-gnueabihf.cmake +++ 
b/cmake/toolchains/arm-linux-gnueabihf.cmake @@ -12,5 +12,9 @@ set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon") set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon") # cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") +set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS}" + CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS}" + CACHE STRING "c++ flags") diff --git a/cmake/toolchains/riscv64-linux-gnu.cmake b/cmake/toolchains/riscv64-linux-gnu.cmake index e3b3b2adbc..a6515dbd7f 100644 --- a/cmake/toolchains/riscv64-linux-gnu.cmake +++ b/cmake/toolchains/riscv64-linux-gnu.cmake @@ -13,5 +13,9 @@ set(CMAKE_C_FLAGS "-march=rv64gc") set(CMAKE_CXX_FLAGS "-march=rv64gc") # cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") +set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS}" + CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS}" + CACHE STRING "c++ flags") diff --git a/cmake/toolchains/riscv64-unknown-linux-gnu.cmake b/cmake/toolchains/riscv64-unknown-linux-gnu.cmake index c24661f6e6..93ddc583fe 100644 --- a/cmake/toolchains/riscv64-unknown-linux-gnu.cmake +++ b/cmake/toolchains/riscv64-unknown-linux-gnu.cmake @@ -2,15 +2,17 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR riscv) if(DEFINED ENV{RISCV_ROOT_PATH}) - file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) + file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) else() - message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") + message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") endif() set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc) set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++) -set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot" CACHE PATH "riscv sysroot") +set(CMAKE_SYSROOT + "${RISCV_ROOT_PATH}/sysroot" + CACHE PATH "riscv sysroot") set(CMAKE_FIND_ROOT_PATH ${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) @@ -22,5 +24,9 @@ set(CMAKE_C_FLAGS "-march=rv64gc") set(CMAKE_CXX_FLAGS "-march=rv64gc") # cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") +set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS}" + CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS}" + CACHE STRING "c++ flags") diff --git a/cmake/toolchains/rknpu2-linux-gnu.cmake b/cmake/toolchains/rknpu2-linux-gnu.cmake index 2bb6835430..4a94f8b238 100644 --- a/cmake/toolchains/rknpu2-linux-gnu.cmake +++ b/cmake/toolchains/rknpu2-linux-gnu.cmake @@ -2,9 +2,9 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR rockchip) if(DEFINED ENV{RKNN_TOOL_CHAIN}) - file(TO_CMAKE_PATH $ENV{RKNN_TOOL_CHAIN} RKNN_TOOL_CHAIN) + file(TO_CMAKE_PATH $ENV{RKNN_TOOL_CHAIN} RKNN_TOOL_CHAIN) else() - message(FATAL_ERROR "RKNN_TOOL_CHAIN env must be defined") + message(FATAL_ERROR "RKNN_TOOL_CHAIN env must be defined") endif() set(CMAKE_C_COMPILER ${RKNN_TOOL_CHAIN}/bin/aarch64-rockchip-linux-gnu-gcc) @@ -19,5 +19,9 @@ set(CMAKE_C_FLAGS "-Wl,--allow-shlib-undefined") set(CMAKE_CXX_FLAGS "-Wl,--allow-shlib-undefined") # cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") +set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS}" + CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS}" + CACHE 
STRING "c++ flags")
diff --git a/csrc/mmdeploy/CMakeLists.txt b/csrc/mmdeploy/CMakeLists.txt
index 6bfbd3a95a..26dce4f586 100644
--- a/csrc/mmdeploy/CMakeLists.txt
+++ b/csrc/mmdeploy/CMakeLists.txt
@@ -2,20 +2,20 @@
add_subdirectory(backend_ops)
-if (MMDEPLOY_BUILD_SDK)
- # include OpenCV for SDK modules since many of them depends on it
- include(${CMAKE_SOURCE_DIR}/cmake/opencv.cmake)
+if(MMDEPLOY_BUILD_SDK)
+ # include OpenCV for SDK modules since many of them depend on it
+ include(${CMAKE_SOURCE_DIR}/cmake/opencv.cmake)
- add_subdirectory(core)
- add_subdirectory(execution)
- add_subdirectory(utils)
- add_subdirectory(archive)
- add_subdirectory(device)
- add_subdirectory(graph)
- add_subdirectory(model)
- add_subdirectory(operation)
- add_subdirectory(preprocess)
- add_subdirectory(net)
- add_subdirectory(codebase)
- add_subdirectory(apis)
-endif ()
+ add_subdirectory(core)
+ add_subdirectory(execution)
+ add_subdirectory(utils)
+ add_subdirectory(archive)
+ add_subdirectory(device)
+ add_subdirectory(graph)
+ add_subdirectory(model)
+ add_subdirectory(operation)
+ add_subdirectory(preprocess)
+ add_subdirectory(net)
+ add_subdirectory(codebase)
+ add_subdirectory(apis)
+endif()
diff --git a/csrc/mmdeploy/apis/CMakeLists.txt b/csrc/mmdeploy/apis/CMakeLists.txt
index 1ab877be90..e137bce311 100644
--- a/csrc/mmdeploy/apis/CMakeLists.txt
+++ b/csrc/mmdeploy/apis/CMakeLists.txt
@@ -4,8 +4,8 @@
add_subdirectory(c)
add_subdirectory(cxx)
add_subdirectory(java)
-# add python subdir conditionally since it's designed to work as
-# a standalone project also
-if (MMDEPLOY_BUILD_SDK_PYTHON_API)
- add_subdirectory(python)
-endif ()
+# add python subdir conditionally since it's designed to work as a standalone
+# project also
+if(MMDEPLOY_BUILD_SDK_PYTHON_API)
+ add_subdirectory(python)
+endif()
diff --git a/csrc/mmdeploy/apis/c/CMakeLists.txt b/csrc/mmdeploy/apis/c/CMakeLists.txt
index f08fa8cf86..4c1755b168 100644
--- a/csrc/mmdeploy/apis/c/CMakeLists.txt
+++ b/csrc/mmdeploy/apis/c/CMakeLists.txt
@@ -6,81 +6,76 @@
include(${CMAKE_SOURCE_DIR}/cmake/MMDeploy.cmake)
set(CAPI_OBJS)
macro(add_object name)
- add_library(${name} OBJECT ${ARGN})
- set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE 1)
- target_compile_definitions(${name} PRIVATE -DMMDEPLOY_API_EXPORTS=1)
- if (NOT MSVC)
- target_compile_options(${name} PRIVATE $<$:-fvisibility=hidden>)
- endif ()
- target_link_libraries(${name} PRIVATE mmdeploy::core)
- target_include_directories(${name} PUBLIC
- $
- $)
- set(CAPI_OBJS ${CAPI_OBJS} ${name})
- mmdeploy_export(${name})
+ add_library(${name} OBJECT ${ARGN})
+ set_target_properties(${name} PROPERTIES POSITION_INDEPENDENT_CODE 1)
+ target_compile_definitions(${name} PRIVATE -DMMDEPLOY_API_EXPORTS=1)
+ if(NOT MSVC)
+ target_compile_options(
+ ${name} PRIVATE $<$:-fvisibility=hidden>)
+ endif()
+ target_link_libraries(${name} PRIVATE mmdeploy::core)
+ target_include_directories(
+ ${name} PUBLIC $
+ $)
+ set(CAPI_OBJS ${CAPI_OBJS} ${name})
+ mmdeploy_export(${name})
endmacro()
-set(COMMON_LIST
- common
- model
- executor
- pipeline)
+set(COMMON_LIST common model executor pipeline)
set(TASK_LIST ${MMDEPLOY_TASKS})
-foreach (TASK ${COMMON_LIST})
- set(TARGET_NAME mmdeploy_${TASK})
- set(OBJECT_NAME mmdeploy_${TASK}_obj)
- add_object(${OBJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.cpp)
- mmdeploy_add_library(${TARGET_NAME})
- target_link_libraries(${TARGET_NAME} PRIVATE ${OBJECT_NAME})
- target_include_directories(${TARGET_NAME} PUBLIC
- $
- $)
- install(FILES
${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.h - DESTINATION include/mmdeploy) -endforeach () +foreach(TASK ${COMMON_LIST}) + set(TARGET_NAME mmdeploy_${TASK}) + set(OBJECT_NAME mmdeploy_${TASK}_obj) + add_object(${OBJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.cpp) + mmdeploy_add_library(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} PRIVATE ${OBJECT_NAME}) + target_include_directories( + ${TARGET_NAME} PUBLIC $ + $) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.h + DESTINATION include/mmdeploy) +endforeach() -target_link_libraries(mmdeploy_executor PUBLIC - mmdeploy_execution mmdeploy_common) -target_link_libraries(mmdeploy_pipeline PUBLIC - mmdeploy_executor mmdeploy_model mmdeploy_common) +target_link_libraries(mmdeploy_executor PUBLIC mmdeploy_execution + mmdeploy_common) +target_link_libraries(mmdeploy_pipeline PUBLIC mmdeploy_executor mmdeploy_model + mmdeploy_common) -foreach (TASK ${TASK_LIST}) - set(TARGET_NAME mmdeploy_${TASK}) - set(OBJECT_NAME mmdeploy_${TASK}_obj) - add_object(${OBJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.cpp) - mmdeploy_add_library(${TARGET_NAME}) - target_link_libraries(${TARGET_NAME} PRIVATE ${OBJECT_NAME} - mmdeploy_pipeline) - target_include_directories(${TARGET_NAME} PUBLIC - $ - $) - install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.h - DESTINATION include/mmdeploy) -endforeach () +foreach(TASK ${TASK_LIST}) + set(TARGET_NAME mmdeploy_${TASK}) + set(OBJECT_NAME mmdeploy_${TASK}_obj) + add_object(${OBJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.cpp) + mmdeploy_add_library(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} PRIVATE ${OBJECT_NAME} mmdeploy_pipeline) + target_include_directories( + ${TARGET_NAME} PUBLIC $ + $) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${TASK}.h + DESTINATION include/mmdeploy) +endforeach() -install(DIRECTORY ${CMAKE_SOURCE_DIR}/demo/csrc/ DESTINATION example/cpp - FILES_MATCHING - PATTERN "*.cpp" - PATTERN "CMakeLists.txt" - ) +install( + DIRECTORY ${CMAKE_SOURCE_DIR}/demo/csrc/ + DESTINATION example/cpp + FILES_MATCHING + PATTERN "*.cpp" + PATTERN "CMakeLists.txt") -if (MMDEPLOY_BUILD_SDK_CSHARP_API OR MMDEPLOY_BUILD_SDK_MONOLITHIC) - add_library(mmdeploy SHARED) - mmdeploy_load_static(mmdeploy MMDeployStaticModules) - mmdeploy_load_dynamic(mmdeploy MMDeployDynamicModules) - target_link_libraries(mmdeploy PRIVATE ${CAPI_OBJS} mmdeploy_execution) - target_include_directories(mmdeploy PUBLIC - $ - $) - set(MMDEPLOY_VERSION ${MMDEPLOY_VERSION_MAJOR} - .${MMDEPLOY_VERSION_MINOR} - .${MMDEPLOY_VERSION_PATCH}) - string(REPLACE ";" "" MMDEPLOY_VERSION ${MMDEPLOY_VERSION}) - set_target_properties(mmdeploy PROPERTIES - VERSION ${MMDEPLOY_VERSION} - SOVERSION ${MMDEPLOY_VERSION_MAJOR}) - mmdeploy_add_rpath(mmdeploy) - mmdeploy_export_impl(mmdeploy) -endif () +if(MMDEPLOY_BUILD_SDK_CSHARP_API OR MMDEPLOY_BUILD_SDK_MONOLITHIC) + add_library(mmdeploy SHARED) + mmdeploy_load_static(mmdeploy MMDeployStaticModules) + mmdeploy_load_dynamic(mmdeploy MMDeployDynamicModules) + target_link_libraries(mmdeploy PRIVATE ${CAPI_OBJS} mmdeploy_execution) + target_include_directories( + mmdeploy PUBLIC $ + $) + set(MMDEPLOY_VERSION ${MMDEPLOY_VERSION_MAJOR} .${MMDEPLOY_VERSION_MINOR} + .${MMDEPLOY_VERSION_PATCH}) + string(REPLACE ";" "" MMDEPLOY_VERSION ${MMDEPLOY_VERSION}) + set_target_properties(mmdeploy PROPERTIES VERSION ${MMDEPLOY_VERSION} + SOVERSION ${MMDEPLOY_VERSION_MAJOR}) + mmdeploy_add_rpath(mmdeploy) + mmdeploy_export_impl(mmdeploy) +endif() diff --git 
a/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp b/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp index 3eec4ef90b..9faf47f349 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/classifier.cpp @@ -16,118 +16,132 @@ using namespace mmdeploy; using namespace std; -int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_classifier_t* classifier) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_classifier_t* classifier) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_classifier_create_v2(model, context, classifier); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_classifier_create_v2(model, context, classifier); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_classifier_t* classifier) { - mmdeploy_model_t model{}; +int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_classifier_t* classifier) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_classifier_create(model, device_name, device_id, classifier); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_classifier_create(model, device_name, device_id, classifier); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_classifier_t* classifier) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)classifier); +int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_classifier_t* classifier) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)classifier); } -int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value) { - return mmdeploy_common_create_input(mats, mat_count, value); +int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value) +{ + return mmdeploy_common_create_input(mats, mat_count, value); } -int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_classification_t** results, - int** result_count) { - wrapped input; - if (auto ec = mmdeploy_classifier_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_classifier_apply_v2(classifier, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_classifier_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_classification_t** results, int** result_count) +{ + wrapped input; + if (auto ec = mmdeploy_classifier_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = 
mmdeploy_classifier_apply_v2(classifier, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_classifier_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)classifier, input, output); +int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)classifier, input, output); } -int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)classifier, input, output); +int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)classifier, input, output); } -int mmdeploy_classifier_get_result(mmdeploy_value_t output, mmdeploy_classification_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = Cast(output)->front(); - - auto classify_outputs = from_value>(value); - - vector _result_count; - _result_count.reserve(classify_outputs.size()); - - for (const auto& cls_output : classify_outputs) { - _result_count.push_back((int)cls_output.size()); +int mmdeploy_classifier_get_result(mmdeploy_value_t output, mmdeploy_classification_t** results, int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - - auto total = std::accumulate(begin(_result_count), end(_result_count), 0); - - std::unique_ptr result_count_data(new int[_result_count.size()]{}); - std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); - - std::unique_ptr result_data( - new mmdeploy_classification_t[total]{}); - auto result_ptr = result_data.get(); - for (const auto& cls_output : classify_outputs) { - for (const auto& label : cls_output) { - result_ptr->label_id = label.label_id; - result_ptr->score = label.score; - ++result_ptr; - } + try + { + Value& value = Cast(output)->front(); + + auto classify_outputs = from_value>(value); + + vector _result_count; + _result_count.reserve(classify_outputs.size()); + + for (const auto& cls_output : classify_outputs) + { + _result_count.push_back((int)cls_output.size()); + } + + auto total = std::accumulate(begin(_result_count), end(_result_count), 0); + + std::unique_ptr result_count_data(new int[_result_count.size()]{}); + std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); + + std::unique_ptr result_data( + new mmdeploy_classification_t[total]{}); + auto result_ptr = result_data.get(); + for (const auto& cls_output : classify_outputs) + { + for (const auto& label : cls_output) + { + result_ptr->label_id = label.label_id; + result_ptr->score = label.score; + ++result_ptr; + } + } + + *result_count = result_count_data.release(); + *results = result_data.release(); + + return MMDEPLOY_SUCCESS; } - - *result_count = result_count_data.release(); - *results = result_data.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) 
{ - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, const int* result_count, - int count) { - delete[] results; - delete[] result_count; +void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, const int* result_count, int count) +{ + delete[] results; + delete[] result_count; } -void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)classifier); +void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)classifier); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/classifier.h b/csrc/mmdeploy/apis/c/mmdeploy/classifier.h index 54e9d0215b..1681cf7fae 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/classifier.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/classifier.h @@ -13,124 +13,125 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_classification_t { - int label_id; - float score; -} mmdeploy_classification_t; - -typedef struct mmdeploy_classifier* mmdeploy_classifier_t; - -/** - * @brief Create classifier's handle - * @param[in] model an instance of mmclassification sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] classifier instance of a classifier, which must be destroyed - * by \ref mmdeploy_classifier_destroy - * @return status of creating classifier's handle - */ -MMDEPLOY_API int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_classifier_t* classifier); - -/** - * @brief Create classifier's handle - * @param[in] model_path path of mmclassification sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] classifier instance of a classifier, which must be destroyed - * by \ref mmdeploy_classifier_destroy - * @return status of creating classifier's handle - */ -MMDEPLOY_API int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, - int device_id, - mmdeploy_classifier_t* classifier); - -/** - * @brief Use classifier created by \ref mmdeploy_classifier_create_by_path to get label - * information of each image in a batch - * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer to save classification results of each - * image, which must be freed by \ref mmdeploy_classifier_release_result - * @param[out] result_count a linear buffer with length being \p mat_count to save the number of - * classification results of each image. 
It must be released by \ref - * mmdeploy_classifier_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_classification_t** results, int** result_count); - -/** - * @brief Release the inference result buffer created \ref mmdeploy_classifier_apply - * @param[in] results classification results buffer - * @param[in] result_count \p results size buffer - * @param[in] count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_classifier_release_result(mmdeploy_classification_t* results, - const int* result_count, int count); - -/** - * @brief Destroy classifier's handle - * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path - */ -MMDEPLOY_API void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_classifier_create, but allows to control execution context of tasks - * via context - */ -MMDEPLOY_API int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_classifier_t* classifier); - -/** - * @brief Pack classifier inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] value the packed value - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); - -/** - * @brief Same as \ref mmdeploy_classifier_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, - mmdeploy_value_t input, mmdeploy_value_t* output); - -/** - * @brief Apply classifier asynchronously - * @param[in] classifier handle of the classifier - * @param[in] input input sender that will be consumed by the operation - * @param[out] output output sender - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -/** - * - * @param[in] output output obtained by applying a classifier - * @param[out] results a linear buffer containing classification results of each image, released by - * \ref mmdeploy_classifier_release_result - * @param[out] result_count a linear buffer containing the number of results for each input image, - * released by \ref mmdeploy_classifier_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_classifier_get_result(mmdeploy_value_t output, - mmdeploy_classification_t** results, - int** result_count); + typedef struct mmdeploy_classification_t + { + int label_id; + float score; + } mmdeploy_classification_t; + + typedef struct mmdeploy_classifier* mmdeploy_classifier_t; + + /** + * @brief Create classifier's handle + * @param[in] model an instance of mmclassification sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] classifier instance of a classifier, which must be destroyed
+ * by \ref mmdeploy_classifier_destroy
+ * @return status of creating classifier's handle
+ */
+ MMDEPLOY_API int mmdeploy_classifier_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_classifier_t* classifier);
+
+ /**
+ * @brief Create classifier's handle
+ * @param[in] model_path path of mmclassification sdk model exported by mmdeploy model converter
+ * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+ * @param[in] device_id id of device.
+ * @param[out] classifier instance of a classifier, which must be destroyed
+ * by \ref mmdeploy_classifier_destroy
+ * @return status of creating classifier's handle
+ */
+ MMDEPLOY_API int mmdeploy_classifier_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_classifier_t* classifier);
+
+ /**
+ * @brief Use classifier created by \ref mmdeploy_classifier_create_by_path to get label
+ * information of each image in a batch
+ * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path
+ * @param[in] mats a batch of images
+ * @param[in] mat_count number of images in the batch
+ * @param[out] results a linear buffer to save classification results of each
+ * image, which must be freed by \ref mmdeploy_classifier_release_result
+ * @param[out] result_count a linear buffer with length being \p mat_count to save the number of
+ * classification results of each image. It must be released by \ref
+ * mmdeploy_classifier_release_result
+ * @return status of inference
+ */
+ MMDEPLOY_API int mmdeploy_classifier_apply(mmdeploy_classifier_t classifier,
+ const mmdeploy_mat_t* mats,
+ int mat_count,
+ mmdeploy_classification_t** results,
+ int** result_count);
+
+ /**
+ * @brief Release the inference result buffer created by \ref mmdeploy_classifier_apply
+ * @param[in] results classification results buffer
+ * @param[in] result_count \p results size buffer
+ * @param[in] count length of \p result_count
+ */
+ MMDEPLOY_API void mmdeploy_classifier_release_result(mmdeploy_classification_t* results,
+ const int* result_count,
+ int count);
+
+ /**
+ * @brief Destroy classifier's handle
+ * @param[in] classifier classifier's handle created by \ref mmdeploy_classifier_create_by_path
+ */
+ MMDEPLOY_API void mmdeploy_classifier_destroy(mmdeploy_classifier_t classifier);
+
+ /******************************************************************************
+ * Experimental asynchronous APIs */
+
+ /**
+ * @brief Same as \ref mmdeploy_classifier_create, but allows one to control the execution
+ * context of tasks via context
+ */
+ MMDEPLOY_API int mmdeploy_classifier_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_classifier_t* classifier);
+
+ /**
+ * @brief Pack classifier inputs into mmdeploy_value_t
+ * @param[in] mats a batch of images
+ * @param[in] mat_count number of images in the batch
+ * @param[out] value the packed value
+ * @return status of the operation
+ */
+ MMDEPLOY_API int mmdeploy_classifier_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value);
+
+ /**
+ * @brief Same as \ref mmdeploy_classifier_apply, but input and output are packed in \ref
+ * mmdeploy_value_t.
+ */ + MMDEPLOY_API int mmdeploy_classifier_apply_v2(mmdeploy_classifier_t classifier, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Apply classifier asynchronously + * @param[in] classifier handle of the classifier + * @param[in] input input sender that will be consumed by the operation + * @param[out] output output sender + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_classifier_apply_async(mmdeploy_classifier_t classifier, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * + * @param[in] output output obtained by applying a classifier + * @param[out] results a linear buffer containing classification results of each image, released by + * \ref mmdeploy_classifier_release_result + * @param[out] result_count a linear buffer containing the number of results for each input image, + * released by \ref mmdeploy_classifier_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_classifier_get_result(mmdeploy_value_t output, + mmdeploy_classification_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common.cpp b/csrc/mmdeploy/apis/c/mmdeploy/common.cpp index e00cc3f1cf..81e43ffce3 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/common.cpp @@ -5,111 +5,144 @@ #include "mmdeploy/core/profiler.h" #include "mmdeploy/executor_internal.h" -mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value) { - if (!value) { - return nullptr; - } - return Guard([&] { return Take(Value(*Cast(value))); }); +mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value) +{ + if (!value) + { + return nullptr; + } + return Guard([&] + { return Take(Value(*Cast(value))); }); } -void mmdeploy_value_destroy(mmdeploy_value_t value) { delete Cast(value); } +void mmdeploy_value_destroy(mmdeploy_value_t value) +{ + delete Cast(value); +} -int mmdeploy_context_create(mmdeploy_context_t* context) { - *context = (mmdeploy_context_t) new Value; - return 0; +int mmdeploy_context_create(mmdeploy_context_t* context) +{ + *context = (mmdeploy_context_t) new Value; + return 0; } -int mmdeploy_context_create_by_device(const char* device_name, int device_id, - mmdeploy_context_t* context) { - mmdeploy_device_t device{}; - int ec = MMDEPLOY_SUCCESS; - mmdeploy_context_t _context{}; - ec = mmdeploy_context_create(&_context); - if (ec != MMDEPLOY_SUCCESS) { - return ec; - } - ec = mmdeploy_device_create(device_name, device_id, &device); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_context_create_by_device(const char* device_name, int device_id, mmdeploy_context_t* context) +{ + mmdeploy_device_t device{}; + int ec = MMDEPLOY_SUCCESS; + mmdeploy_context_t _context{}; + ec = mmdeploy_context_create(&_context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_device_create(device_name, device_id, &device); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_context_add(_context, MMDEPLOY_TYPE_DEVICE, nullptr, device); + mmdeploy_device_destroy(device); + if (ec == MMDEPLOY_SUCCESS) + { + *context = _context; + } return ec; - } - ec = mmdeploy_context_add(_context, MMDEPLOY_TYPE_DEVICE, nullptr, device); - mmdeploy_device_destroy(device); - if (ec == MMDEPLOY_SUCCESS) { - *context = _context; - } - return ec; } -void mmdeploy_context_destroy(mmdeploy_context_t context) { delete Cast(context); } +void mmdeploy_context_destroy(mmdeploy_context_t context) +{ + delete Cast(context); +} -int mmdeploy_common_create_input(const 
mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value) { - if (mat_count && mats == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - auto input = std::make_unique(Value{Value::kArray}); - for (int i = 0; i < mat_count; ++i) { - input->front().push_back({{"ori_img", Cast(mats[i])}}); +int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value) +{ + if (mat_count && mats == nullptr) + { + return MMDEPLOY_E_INVALID_ARG; } - *value = Cast(input.release()); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_SUCCESS; -} -int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device) { - Device tmp(device_name, device_id); - if (tmp.platform_id() == -1) { - MMDEPLOY_ERROR("Device \"{}\" not found", device_name); - return MMDEPLOY_E_INVALID_ARG; - } - *device = (mmdeploy_device_t) new Device(tmp); - return MMDEPLOY_SUCCESS; + try + { + auto input = std::make_unique(Value{Value::kArray}); + for (int i = 0; i < mat_count; ++i) + { + input->front().push_back({{"ori_img", Cast(mats[i])}}); + } + *value = Cast(input.release()); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + + return MMDEPLOY_SUCCESS; } -void mmdeploy_device_destroy(mmdeploy_device_t device) { delete (Device*)device; } +int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device) +{ + Device tmp(device_name, device_id); + if (tmp.platform_id() == -1) + { + MMDEPLOY_ERROR("Device \"{}\" not found", device_name); + return MMDEPLOY_E_INVALID_ARG; + } + *device = (mmdeploy_device_t) new Device(tmp); + return MMDEPLOY_SUCCESS; +} -int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler) { - *profiler = (mmdeploy_profiler_t) new profiler::Profiler(path); - return MMDEPLOY_SUCCESS; +void mmdeploy_device_destroy(mmdeploy_device_t device) +{ + delete (Device*)device; } -void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler) { - if (profiler) { - auto p = (profiler::Profiler*)profiler; - p->Release(); - delete p; - } +int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler) +{ + *profiler = (mmdeploy_profiler_t) new profiler::Profiler(path); + return MMDEPLOY_SUCCESS; } -int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, - const void* object) { - auto& ctx = *Cast(context); - switch (type) { - case MMDEPLOY_TYPE_DEVICE: { - const auto& device = *(Device*)object; - ctx["device"] = device; - ctx["stream"] = Stream(device); - break; +void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler) +{ + if (profiler) + { + auto p = (profiler::Profiler*)profiler; + p->Release(); + delete p; } - case MMDEPLOY_TYPE_SCHEDULER: - ctx["scheduler"][name] = *Cast((const mmdeploy_scheduler_t)object); - break; - case MMDEPLOY_TYPE_MODEL: - ctx["model"][name] = *Cast((const mmdeploy_model_t)object); - break; - case MMDEPLOY_TYPE_PROFILER: { - const auto& profiler = *(profiler::Profiler*)object; - profiler::Scope* root(profiler.scope()); - ctx["scope"] = root; - break; +} + +int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, const void* object) +{ + auto& ctx = *Cast(context); + switch (type) + { + case MMDEPLOY_TYPE_DEVICE: + { + const auto& device = 
*(Device*)object; + ctx["device"] = device; + ctx["stream"] = Stream(device); + break; + } + case MMDEPLOY_TYPE_SCHEDULER: + ctx["scheduler"][name] = *Cast((const mmdeploy_scheduler_t)object); + break; + case MMDEPLOY_TYPE_MODEL: + ctx["model"][name] = *Cast((const mmdeploy_model_t)object); + break; + case MMDEPLOY_TYPE_PROFILER: + { + const auto& profiler = *(profiler::Profiler*)object; + profiler::Scope* root(profiler.scope()); + ctx["scope"] = root; + break; + } + default: + return MMDEPLOY_E_NOT_SUPPORTED; } - default: - return MMDEPLOY_E_NOT_SUPPORTED; - } - return 0; + return 0; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common.h b/csrc/mmdeploy/apis/c/mmdeploy/common.h index c665134cbf..26b92973ca 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/common.h @@ -6,19 +6,19 @@ #include // NOLINT #ifndef MMDEPLOY_EXPORT -#ifdef _MSC_VER -#define MMDEPLOY_EXPORT __declspec(dllexport) -#else -#define MMDEPLOY_EXPORT __attribute__((visibility("default"))) -#endif + #ifdef _MSC_VER + #define MMDEPLOY_EXPORT __declspec(dllexport) + #else + #define MMDEPLOY_EXPORT __attribute__((visibility("default"))) + #endif #endif #ifndef MMDEPLOY_API -#ifdef MMDEPLOY_API_EXPORTS -#define MMDEPLOY_API MMDEPLOY_EXPORT -#else -#define MMDEPLOY_API -#endif + #ifdef MMDEPLOY_API_EXPORTS + #define MMDEPLOY_API MMDEPLOY_EXPORT + #else + #define MMDEPLOY_API + #endif #endif // clang-format off @@ -54,136 +54,137 @@ typedef enum mmdeploy_status_t { // clang-format on -typedef struct mmdeploy_device* mmdeploy_device_t; +typedef struct mmdeploy_device* mmdeploy_device_t; typedef struct mmdeploy_profiler* mmdeploy_profiler_t; -typedef struct mmdeploy_mat_t { - uint8_t* data; - int height; - int width; - int channel; - mmdeploy_pixel_format_t format; - mmdeploy_data_type_t type; - mmdeploy_device_t device; +typedef struct mmdeploy_mat_t +{ + uint8_t* data; + int height; + int width; + int channel; + mmdeploy_pixel_format_t format; + mmdeploy_data_type_t type; + mmdeploy_device_t device; } mmdeploy_mat_t; -typedef struct mmdeploy_rect_t { - float left; - float top; - float right; - float bottom; +typedef struct mmdeploy_rect_t +{ + float left; + float top; + float right; + float bottom; } mmdeploy_rect_t; -typedef struct mmdeploy_point_t { - float x; - float y; +typedef struct mmdeploy_point_t +{ + float x; + float y; } mmdeploy_point_t; -typedef struct mmdeploy_value* mmdeploy_value_t; +typedef struct mmdeploy_value* mmdeploy_value_t; typedef struct mmdeploy_context* mmdeploy_context_t; -typedef enum mmdeploy_context_type_t { - MMDEPLOY_TYPE_DEVICE = 0, - MMDEPLOY_TYPE_STREAM = 1, - MMDEPLOY_TYPE_MODEL = 2, - MMDEPLOY_TYPE_SCHEDULER = 3, - MMDEPLOY_TYPE_MAT = 4, - MMDEPLOY_TYPE_PROFILER = 5, +typedef enum mmdeploy_context_type_t +{ + MMDEPLOY_TYPE_DEVICE = 0, + MMDEPLOY_TYPE_STREAM = 1, + MMDEPLOY_TYPE_MODEL = 2, + MMDEPLOY_TYPE_SCHEDULER = 3, + MMDEPLOY_TYPE_MAT = 4, + MMDEPLOY_TYPE_PROFILER = 5, } mmdeploy_context_type_t; #if __cplusplus -extern "C" { +extern "C" +{ #endif -/** - * Copy value - * @param value - * @return - */ -MMDEPLOY_API mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value); - -/** - * Destroy value - * @param value - */ -MMDEPLOY_API void mmdeploy_value_destroy(mmdeploy_value_t value); - -/** - * Create device handle - * @param device_name - * @param device_id - * @param device - * @return - */ -MMDEPLOY_API int mmdeploy_device_create(const char* device_name, int device_id, - mmdeploy_device_t* device); - -/** - * Destroy device handle - * @param 
device - */ -MMDEPLOY_API void mmdeploy_device_destroy(mmdeploy_device_t device); - -/** - * Create profiler - * @param path path to save the profile data - * @param profiler handle for profiler, should be added to context and deleted by - * mmdeploy_profiler_destroy - * @return status of create - */ -MMDEPLOY_API int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler); - -/** - * Destroy profiler handle - * @param profiler handle for profiler, profile data will be written to disk after this call - */ -MMDEPLOY_API void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler); - -/** - * Create context - * @param context - * @return - */ -MMDEPLOY_API int mmdeploy_context_create(mmdeploy_context_t* context); - -/** - * Create context - * @param device_name - * @param device_id - * @param context - * @return - */ -MMDEPLOY_API int mmdeploy_context_create_by_device(const char* device_name, int device_id, - mmdeploy_context_t* context); - -/** - * Destroy context - * @param context - */ -MMDEPLOY_API void mmdeploy_context_destroy(mmdeploy_context_t context); - -/** - * Add context object - * @param context - * @param type - * @param name - * @param object - * @return - */ -MMDEPLOY_API int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, - const char* name, const void* object); - -/** - * Create input value from array of mats - * @param mats - * @param mat_count - * @param value - * @return - */ -MMDEPLOY_API int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); + /** + * Copy value + * @param value + * @return + */ + MMDEPLOY_API mmdeploy_value_t mmdeploy_value_copy(mmdeploy_value_t value); + + /** + * Destroy value + * @param value + */ + MMDEPLOY_API void mmdeploy_value_destroy(mmdeploy_value_t value); + + /** + * Create device handle + * @param device_name + * @param device_id + * @param device + * @return + */ + MMDEPLOY_API int mmdeploy_device_create(const char* device_name, int device_id, mmdeploy_device_t* device); + + /** + * Destroy device handle + * @param device + */ + MMDEPLOY_API void mmdeploy_device_destroy(mmdeploy_device_t device); + + /** + * Create profiler + * @param path path to save the profile data + * @param profiler handle for profiler, should be added to context and deleted by + * mmdeploy_profiler_destroy + * @return status of create + */ + MMDEPLOY_API int mmdeploy_profiler_create(const char* path, mmdeploy_profiler_t* profiler); + + /** + * Destroy profiler handle + * @param profiler handle for profiler, profile data will be written to disk after this call + */ + MMDEPLOY_API void mmdeploy_profiler_destroy(mmdeploy_profiler_t profiler); + + /** + * Create context + * @param context + * @return + */ + MMDEPLOY_API int mmdeploy_context_create(mmdeploy_context_t* context); + + /** + * Create context + * @param device_name + * @param device_id + * @param context + * @return + */ + MMDEPLOY_API int mmdeploy_context_create_by_device(const char* device_name, int device_id, mmdeploy_context_t* context); + + /** + * Destroy context + * @param context + */ + MMDEPLOY_API void mmdeploy_context_destroy(mmdeploy_context_t context); + + /** + * Add context object + * @param context + * @param type + * @param name + * @param object + * @return + */ + MMDEPLOY_API int mmdeploy_context_add(mmdeploy_context_t context, mmdeploy_context_type_t type, const char* name, const void* object); + + /** + * Create input value from array of mats + * @param mats + * @param mat_count + * @param 
value + * @return + */ + MMDEPLOY_API int mmdeploy_common_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value); #if __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h b/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h index a1ddecb54d..24a776d8be 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/common_internal.h @@ -12,93 +12,160 @@ using namespace mmdeploy; -namespace { - -inline mmdeploy_value_t Cast(Value* s) { return reinterpret_cast(s); } - -inline Value* Cast(mmdeploy_value_t s) { return reinterpret_cast(s); } - -inline Value Take(mmdeploy_value_t v) { - auto value = std::move(*Cast(v)); - mmdeploy_value_destroy(v); - return value; -} - -inline Value* Cast(mmdeploy_context_t c) { return reinterpret_cast(c); } - -inline mmdeploy_value_t Take(Value v) { - return Cast(new Value(std::move(v))); // NOLINT -} - -inline mmdeploy_pipeline_t Cast(AsyncHandle* pipeline) { - return reinterpret_cast(pipeline); -} - -inline AsyncHandle* Cast(mmdeploy_pipeline_t pipeline) { - return reinterpret_cast(pipeline); -} - -inline mmdeploy_model_t Cast(Model* model) { return reinterpret_cast(model); } - -inline Model* Cast(mmdeploy_model_t model) { return reinterpret_cast(model); } - -inline Mat Cast(const mmdeploy_mat_t& mat) { - return Mat{mat.height, mat.width, PixelFormat(mat.format), - DataType(mat.type), mat.data, mat.device ? *(const Device*)mat.device : Device{0}}; -} - -template -std::invoke_result_t Guard(F f) { - try { - return f(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return nullptr; -} - -template -class wrapped {}; - -template -class wrapped> { - public: - wrapped() noexcept : v_(nullptr) {} - explicit wrapped(T v) noexcept : v_(v) {} - - void reset() { - if (v_) { - delete Cast(v_); - v_ = nullptr; +namespace +{ + + inline mmdeploy_value_t Cast(Value* s) + { + return reinterpret_cast(s); + } + + inline Value* Cast(mmdeploy_value_t s) + { + return reinterpret_cast(s); + } + + inline Value Take(mmdeploy_value_t v) + { + auto value = std::move(*Cast(v)); + mmdeploy_value_destroy(v); + return value; + } + + inline Value* Cast(mmdeploy_context_t c) + { + return reinterpret_cast(c); } - } - ~wrapped() { reset(); } + inline mmdeploy_value_t Take(Value v) + { + return Cast(new Value(std::move(v))); // NOLINT + } - wrapped(const wrapped&) = delete; - wrapped& operator=(const wrapped&) = delete; + inline mmdeploy_pipeline_t Cast(AsyncHandle* pipeline) + { + return reinterpret_cast(pipeline); + } - wrapped(wrapped&& other) noexcept : v_(other.release()) {} - wrapped& operator=(wrapped&& other) noexcept { - reset(); - v_ = other.release(); - return *this; - } + inline AsyncHandle* Cast(mmdeploy_pipeline_t pipeline) + { + return reinterpret_cast(pipeline); + } - T release() noexcept { return std::exchange(v_, nullptr); } + inline mmdeploy_model_t Cast(Model* model) + { + return reinterpret_cast(model); + } - auto operator*() { return Cast(v_); } - auto operator-> () { return Cast(v_); } + inline Model* Cast(mmdeploy_model_t model) + { + return reinterpret_cast(model); + } - T* ptr() noexcept { return &v_; } + inline Mat Cast(const mmdeploy_mat_t& mat) + { + return Mat{mat.height, + mat.width, + PixelFormat(mat.format), + DataType(mat.type), + mat.data, + mat.device ? 
*(const Device*)mat.device : Device{0}}; + } - operator T() const noexcept { return v_; } // NOLINT + template + std::invoke_result_t Guard(F f) + { + try + { + return f(); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + + return nullptr; + } - private: - T v_; -}; + template + class wrapped + { + }; + + template + class wrapped> + { + public: + wrapped() noexcept + : v_(nullptr) + { + } + explicit wrapped(T v) noexcept + : v_(v) + { + } + + void reset() + { + if (v_) + { + delete Cast(v_); + v_ = nullptr; + } + } + + ~wrapped() + { + reset(); + } + + wrapped(const wrapped&) = delete; + wrapped& operator=(const wrapped&) = delete; + + wrapped(wrapped&& other) noexcept + : v_(other.release()) + { + } + + wrapped& operator=(wrapped&& other) noexcept + { + reset(); + v_ = other.release(); + return *this; + } + + T release() noexcept + { + return std::exchange(v_, nullptr); + } + + auto operator*() + { + return Cast(v_); + } + + auto operator->() + { + return Cast(v_); + } + + T* ptr() noexcept + { + return &v_; + } + + operator T() const noexcept + { + return v_; + } // NOLINT + + private: + T v_; + }; } // namespace diff --git a/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp index aadf92fb62..6ad627be50 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/detector.cpp @@ -24,126 +24,143 @@ using ResultType = mmdeploy::Structure, // std::vector>; // -int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); } -int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, - mmdeploy_detector_t* detector) { - mmdeploy_model_t model{}; +int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_detector_t* detector) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int 
mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input) { - return mmdeploy_common_create_input(mats, mat_count, input); +int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input) +{ + return mmdeploy_common_create_input(mats, mat_count, input); } -int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_detection_t** results, int** result_count) { - wrapped input; - if (auto ec = mmdeploy_detector_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_detector_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_detection_t** results, int** result_count) +{ + wrapped input; + if (auto ec = mmdeploy_detector_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_detector_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_detector_get_result(mmdeploy_value_t output, mmdeploy_detection_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = Cast(output)->front(); - auto detector_outputs = from_value>(value); - - vector _result_count(detector_outputs.size()); - size_t total = 0; - for (size_t i = 0; i < detector_outputs.size(); ++i) { - _result_count[i] = static_cast(detector_outputs[i].size()); - total += detector_outputs[i].size(); +int mmdeploy_detector_get_result(mmdeploy_value_t output, mmdeploy_detection_t** results, int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - ResultType r({total, 1, 1, 1}); - auto [result_data, result_count_vec, masks, buffers] = r.pointers(); - - auto result_ptr = result_data; - - for (const auto& det_output : detector_outputs) { - for (const auto& detection : det_output) { - result_ptr->label_id = detection.label_id; - result_ptr->score = detection.score; - const auto& bbox = detection.bbox; - result_ptr->bbox = {bbox[0], bbox[1], bbox[2], bbox[3]}; - auto mask_byte_size = detection.mask.byte_size(); - if (mask_byte_size) { - auto& mask = detection.mask; - result_ptr->mask = &masks->emplace_back(); - buffers->push_back(mask.buffer()); - result_ptr->mask->data = mask.data(); - 
result_ptr->mask->width = mask.width(); - result_ptr->mask->height = mask.height(); + try + { + Value& value = Cast(output)->front(); + auto detector_outputs = from_value>(value); + + vector _result_count(detector_outputs.size()); + size_t total = 0; + for (size_t i = 0; i < detector_outputs.size(); ++i) + { + _result_count[i] = static_cast(detector_outputs[i].size()); + total += detector_outputs[i].size(); } - ++result_ptr; - } - } - *result_count_vec = std::move(_result_count); - *result_count = result_count_vec->data(); - *results = result_data; - r.release(); + ResultType r({total, 1, 1, 1}); + auto [result_data, result_count_vec, masks, buffers] = r.pointers(); + + auto result_ptr = result_data; + + for (const auto& det_output : detector_outputs) + { + for (const auto& detection : det_output) + { + result_ptr->label_id = detection.label_id; + result_ptr->score = detection.score; + const auto& bbox = detection.bbox; + result_ptr->bbox = {bbox[0], bbox[1], bbox[2], bbox[3]}; + auto mask_byte_size = detection.mask.byte_size(); + if (mask_byte_size) + { + auto& mask = detection.mask; + result_ptr->mask = &masks->emplace_back(); + buffers->push_back(mask.buffer()); + result_ptr->mask->data = mask.data(); + result_ptr->mask->width = mask.width(); + result_ptr->mask->height = mask.height(); + } + ++result_ptr; + } + } - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + *result_count_vec = std::move(_result_count); + *result_count = result_count_vec->data(); + *results = result_data; + r.release(); + + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_detector_release_result(mmdeploy_detection_t* results, const int* result_count, - int count) { - auto num_dets = std::accumulate(result_count, result_count + count, 0); - ResultType deleter({static_cast(num_dets), 1, 1, 1}, results); +void mmdeploy_detector_release_result(mmdeploy_detection_t* results, const int* result_count, int count) +{ + auto num_dets = std::accumulate(result_count, result_count + count, 0); + ResultType deleter({static_cast(num_dets), 1, 1, 1}, results); } -void mmdeploy_detector_destroy(mmdeploy_detector_t detector) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); +void mmdeploy_detector_destroy(mmdeploy_detector_t detector) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/detector.h b/csrc/mmdeploy/apis/c/mmdeploy/detector.h index 5c5ba2f356..713214ca4f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/detector.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/detector.h @@ -13,124 +13,123 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_instance_mask_t { - char* data; - int height; - int width; -} mmdeploy_instance_mask_t; - -typedef struct mmdeploy_detection_t { - int label_id; - float score; - mmdeploy_rect_t bbox; - mmdeploy_instance_mask_t* mask; -} mmdeploy_detection_t; - -typedef struct mmdeploy_detector* mmdeploy_detector_t; - -/** - * @brief Create detector's handle - * @param[in] model an instance of mmdetection sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a detector - * @return status of creating detector's handle - */ -MMDEPLOY_API int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_detector_t* detector); - -/** - * @brief Create detector's handle - * @param[in] model_path path of mmdetection sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a detector - * @return status of creating detector's handle - */ -MMDEPLOY_API int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_detector_t* detector); - -/** - * @brief Apply detector to batch images and get their inference results - * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_detector_release_result - * @param[out] result_count a linear buffer with length being \p mat_count to save the number of - * detection results of each image. 
And it must be released by \ref - * mmdeploy_detector_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_detection_t** results, - int** result_count); - -/** @brief Release the inference result buffer created by \ref mmdeploy_detector_apply - * @param[in] results detection results buffer - * @param[in] result_count \p results size buffer - * @param[in] count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_detector_release_result(mmdeploy_detection_t* results, - const int* result_count, int count); - -/** - * @brief Destroy detector's handle - * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path - */ -MMDEPLOY_API void mmdeploy_detector_destroy(mmdeploy_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks - * via context - */ -MMDEPLOY_API int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_detector_t* detector); - -/** - * @brief Pack detector inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @return the created value - */ -MMDEPLOY_API int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input); - -/** - * @brief Same as \ref mmdeploy_detector_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Apply detector asynchronously - * @param[in] detector handle to the detector - * @param[in] input input sender - * @return output sender - */ -MMDEPLOY_API int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, - mmdeploy_sender_t input, mmdeploy_sender_t* output); - -/** - * @brief Unpack detector output from a mmdeploy_value_t - * @param[in] output output obtained by applying a detector - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_detector_release_result - * @param[out] result_count a linear buffer with length number of input images to save the number of - * detection results of each image. Must be released by \ref - * mmdeploy_detector_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_detector_get_result(mmdeploy_value_t output, - mmdeploy_detection_t** results, int** result_count); + typedef struct mmdeploy_instance_mask_t + { + char* data; + int height; + int width; + } mmdeploy_instance_mask_t; + + typedef struct mmdeploy_detection_t + { + int label_id; + float score; + mmdeploy_rect_t bbox; + mmdeploy_instance_mask_t* mask; + } mmdeploy_detection_t; + + typedef struct mmdeploy_detector* mmdeploy_detector_t; + + /** + * @brief Create detector's handle + * @param[in] model an instance of mmdetection sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] detector instance of a detector + * @return status of creating detector's handle + */ + MMDEPLOY_API int mmdeploy_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_detector_t* detector); + + /** + * @brief Create detector's handle + * @param[in] model_path path of mmdetection sdk model exported by mmdeploy model converter + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] detector instance of a detector + * @return status of creating detector's handle + */ + MMDEPLOY_API int mmdeploy_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_detector_t* detector); + + /** + * @brief Apply detector to a batch of images and get their inference results + * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] results a linear buffer to save detection results of each image. It must be released + * by \ref mmdeploy_detector_release_result + * @param[out] result_count a linear buffer with length being \p mat_count to save the number of + * detection results of each image. And it must be released by \ref + * mmdeploy_detector_release_result + * @return status of inference + */ + MMDEPLOY_API int mmdeploy_detector_apply(mmdeploy_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_detection_t** results, int** result_count); + + /** @brief Release the inference result buffer created by \ref mmdeploy_detector_apply + * @param[in] results detection results buffer + * @param[in] result_count \p results size buffer + * @param[in] count length of \p result_count + */ + MMDEPLOY_API void mmdeploy_detector_release_result(mmdeploy_detection_t* results, + const int* result_count, + int count); + + /** + * @brief Destroy detector's handle + * @param[in] detector detector's handle created by \ref mmdeploy_detector_create_by_path + */ + MMDEPLOY_API void mmdeploy_detector_destroy(mmdeploy_detector_t detector); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + /** + * @brief Same as \ref mmdeploy_detector_create, but allows controlling the execution context of tasks + * via context + */ + MMDEPLOY_API int mmdeploy_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_detector_t* detector); + + /** + * @brief Pack detector inputs into mmdeploy_value_t + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] input the packed input value + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input); + + /** + * @brief Same as \ref mmdeploy_detector_apply, but input and output are packed in \ref + * mmdeploy_value_t.
+ */ + MMDEPLOY_API int mmdeploy_detector_apply_v2(mmdeploy_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output); + + /** + * @brief Apply detector asynchronously + * @param[in] detector handle to the detector + * @param[in] input input sender + * @return output sender + */ + MMDEPLOY_API int mmdeploy_detector_apply_async(mmdeploy_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * @brief Unpack detector output from a mmdeploy_value_t + * @param[in] output output obtained by applying a detector + * @param[out] results a linear buffer to save detection results of each image. It must be released + * by \ref mmdeploy_detector_release_result + * @param[out] result_count a linear buffer with length number of input images to save the number of + * detection results of each image. Must be released by \ref + * mmdeploy_detector_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_detector_get_result(mmdeploy_value_t output, + mmdeploy_detection_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp b/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp index 2fdfb9091f..0de722b58c 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/executor.cpp @@ -9,199 +9,283 @@ using namespace mmdeploy; -namespace { +namespace +{ -mmdeploy_scheduler_t CreateScheduler(const char* type, const Value& config = Value()) { - try { - auto creator = gRegistry<SchedulerType>().Get(type); - if (!creator) { - MMDEPLOY_ERROR("Creator for {} not found. Available schedulers: {}", type, - gRegistry<SchedulerType>().List()); - return nullptr; + mmdeploy_scheduler_t CreateScheduler(const char* type, const Value& config = Value()) + { + try + { + auto creator = gRegistry<SchedulerType>().Get(type); + if (!creator) + { + MMDEPLOY_ERROR("Creator for {} not found. 
Available schedulers: {}", + type, + gRegistry().List()); + return nullptr; + } + return Cast(new SchedulerType(creator->Create(config))); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create Scheduler: {} ({}), config: {}", type, e.what(), config); + return nullptr; + } } - return Cast(new SchedulerType(creator->Create(config))); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create Scheduler: {} ({}), config: {}", type, e.what(), config); - return nullptr; - } -} } // namespace -mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Take(SenderType(*Cast(input))); +mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Take(SenderType(*Cast(input))); } -int mmdeploy_sender_destroy(mmdeploy_sender_t sender) { - delete Cast(sender); - return 0; +int mmdeploy_sender_destroy(mmdeploy_sender_t sender) +{ + delete Cast(sender); + return 0; } -mmdeploy_scheduler_t mmdeploy_executor_inline() { return CreateScheduler("Inline"); } +mmdeploy_scheduler_t mmdeploy_executor_inline() +{ + return CreateScheduler("Inline"); +} -mmdeploy_scheduler_t mmdeploy_executor_system_pool() { - // create a thread pool context and hold its shared handle - static auto scheduler = *Cast(CreateScheduler("ThreadPool")); - // return a copy of the handle to the thread pool - return Cast(new SchedulerType(scheduler)); +mmdeploy_scheduler_t mmdeploy_executor_system_pool() +{ + // create a thread pool context and hold its shared handle + static auto scheduler = *Cast(CreateScheduler("ThreadPool")); + // return a copy of the handle to the thread pool + return Cast(new SchedulerType(scheduler)); } -mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads) { - return CreateScheduler("ThreadPool", {{"num_threads", num_threads}}); +mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads) +{ + return CreateScheduler("ThreadPool", {{"num_threads", num_threads}}); } -mmdeploy_scheduler_t mmdeploy_executor_create_thread() { return CreateScheduler("SingleThread"); } +mmdeploy_scheduler_t mmdeploy_executor_create_thread() +{ + return CreateScheduler("SingleThread"); +} mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler, - int max_batch_size, int timeout) { - if (!scheduler) { - return nullptr; - } - return CreateScheduler( - "DynamicBatch", - {{"scheduler", *Cast(scheduler)}, {"max_batch_size", max_batch_size}, {"timeout", timeout}}); + int max_batch_size, + int timeout) +{ + if (!scheduler) + { + return nullptr; + } + return CreateScheduler("DynamicBatch", + {{"scheduler", *Cast(scheduler)}, + {"max_batch_size", max_batch_size}, + {"timeout", timeout}}); } -int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler) { - delete Cast(scheduler); - return 0; +int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler) +{ + delete Cast(scheduler); + return 0; } -mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value) { - if (value) { - return Guard([&] { return Take(Just(*Cast(value))); }); - } else { - return Take(Just(Value())); - } +mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value) +{ + if (value) + { + return Guard([&] + { return Take(Just(*Cast(value))); }); + } + else + { + return Take(Just(Value())); + } } -mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler) { - if (!scheduler) { - return nullptr; - } - return Guard([&] { return 
Take(Then(Schedule(*Cast(scheduler)), [] { return Value(); })); }); +mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler) +{ + if (!scheduler) + { + return nullptr; + } + return Guard([&] + { return Take(Then(Schedule(*Cast(scheduler)), + [] + { + return Value(); + })); }); } mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, - mmdeploy_value_t value) { - if (!scheduler || !value) { - return nullptr; - } - return Guard([&] { return Take(TransferJust(*Cast(scheduler), *Cast(value))); }); -} - -mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, - mmdeploy_scheduler_t scheduler) { - if (!input || !scheduler) { - return nullptr; - } - return Guard([&] { return Take(Transfer(Take(input), *Cast(scheduler))); }); -} - -mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, mmdeploy_sender_t input) { - if (!scheduler || !input) { - return nullptr; - } - return Guard([&] { return Take(On(*Cast(scheduler), Take(input))); }); -} - -mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, mmdeploy_then_fn_t fn, - void* context) { - if (!input || !fn) { - return nullptr; - } - return Guard([&] { - return Take(Then(Take(input), [fn, context](Value args) { - auto out = Cast(fn(Take(std::move(args)), context)); - Value ret(std::move(*out)); - delete out; - return ret; - })); - }); -} - -mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, mmdeploy_let_value_fn_t fn, - void* context) { - if (!input || !fn) { - return nullptr; - } - return Guard([&] { - return Take(LetValue(Take(input), [fn, context](Value& args) { - auto out = Cast(fn(Cast(&args), context)); - SenderType ret(std::move(*out)); - delete out; - return ret; - })); - }); -} - -mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(Split(Take(input))); }); -} - -mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n) { - if (!inputs) { - return nullptr; - } - return Guard([&] { - std::vector<SenderType> senders; - senders.reserve(n); - for (int i = 0; i < n; ++i) { - senders.emplace_back(Take(inputs[i])); - } - return Take( - Then(WhenAll(std::move(senders)), [](Value::Array&& v) { return Value(std::move(v)); })); - }); -} - -mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(EnsureStarted(Take(input))); }); -} - -int mmdeploy_executor_start_detached(mmdeploy_sender_t input) { - if (!input) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - StartDetached(Take(input)); - return 0; - } catch (...) 
{ - } - return MMDEPLOY_E_FAIL; + mmdeploy_value_t value) +{ + if (!scheduler || !value) + { + return nullptr; + } + return Guard([&] + { return Take(TransferJust(*Cast(scheduler), *Cast(value))); }); +} + +mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, + mmdeploy_scheduler_t scheduler) +{ + if (!input || !scheduler) + { + return nullptr; + } + return Guard([&] + { return Take(Transfer(Take(input), *Cast(scheduler))); }); +} + +mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, mmdeploy_sender_t input) +{ + if (!scheduler || !input) + { + return nullptr; + } + return Guard([&] + { return Take(On(*Cast(scheduler), Take(input))); }); +} + +mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, mmdeploy_then_fn_t fn, void* context) +{ + if (!input || !fn) + { + return nullptr; + } + return Guard([&] + { return Take(Then(Take(input), + [fn, context](Value args) + { + auto out = Cast(fn(Take(std::move(args)), context)); + Value ret(std::move(*out)); + delete out; + return ret; + })); }); +} + +mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, mmdeploy_let_value_fn_t fn, void* context) +{ + if (!input || !fn) + { + return nullptr; + } + return Guard([&] + { return Take(LetValue(Take(input), + [fn, context](Value& args) + { + auto out = Cast(fn(Cast(&args), context)); + SenderType ret(std::move(*out)); + delete out; + return ret; + })); }); } -mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input) { - if (!input) { - return nullptr; - } - return Guard([&] { return Take(std::get<Value>(SyncWait(Take(input)))); }); +mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(Split(Take(input))); }); +} + +mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n) +{ + if (!inputs) + { + return nullptr; + } + return Guard([&] + { + std::vector<SenderType> senders; + senders.reserve(n); + for (int i = 0; i < n; ++i) + { + senders.emplace_back(Take(inputs[i])); + } + return Take(Then(WhenAll(std::move(senders)), + [](Value::Array&& v) + { + return Value(std::move(v)); + })); }); +} + +mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(EnsureStarted(Take(input))); }); } -int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t sender, mmdeploy_value_t* value) { - if (!sender) { - return MMDEPLOY_E_INVALID_ARG; - } - auto result = mmdeploy_executor_sync_wait(sender); - if (!result) { +int mmdeploy_executor_start_detached(mmdeploy_sender_t input) +{ + if (!input) + { + return MMDEPLOY_E_INVALID_ARG; + } + + try + { + StartDetached(Take(input)); + return 0; + } + catch (...) 
+ { + } + return MMDEPLOY_E_FAIL; - } - if (value) { - *value = result; - } else { - mmdeploy_value_destroy(result); - } - return MMDEPLOY_SUCCESS; } -void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context) { - Execute(*Cast(scheduler), [fn, context] { fn(context); }); +mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input) +{ + if (!input) + { + return nullptr; + } + return Guard([&] + { return Take(std::get<Value>(SyncWait(Take(input)))); }); +} + +int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t sender, mmdeploy_value_t* value) +{ + if (!sender) + { + return MMDEPLOY_E_INVALID_ARG; + } + + auto result = mmdeploy_executor_sync_wait(sender); + if (!result) + { + return MMDEPLOY_E_FAIL; + } + + if (value) + { + *value = result; + } + else + { + mmdeploy_value_destroy(result); + } + + return MMDEPLOY_SUCCESS; +} + +void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context) +{ + Execute(*Cast(scheduler), + [fn, context] + { + fn(context); + }); } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor.h b/csrc/mmdeploy/apis/c/mmdeploy/executor.h index a2c8ffa387..4b044a6b51 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/executor.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/executor.h @@ -6,133 +6,135 @@ #include "mmdeploy/common.h" #if __cplusplus -extern "C" { +extern "C" +{ #endif -/****************************************************************************** - * Experimental asynchronous APIs */ + /****************************************************************************** + * Experimental asynchronous APIs */ -typedef mmdeploy_value_t (*mmdeploy_then_fn_t)(mmdeploy_value_t, void*); + typedef mmdeploy_value_t (*mmdeploy_then_fn_t)(mmdeploy_value_t, void*); -typedef mmdeploy_value_t (*mmdeploy_then_fn_v2_t)(mmdeploy_value_t*, void*); - -typedef int (*mmdeploy_then_fn_v3_t)(mmdeploy_value_t* input, mmdeploy_value_t* output, void*); + typedef mmdeploy_value_t (*mmdeploy_then_fn_v2_t)(mmdeploy_value_t*, void*); + + typedef int (*mmdeploy_then_fn_v3_t)(mmdeploy_value_t* input, mmdeploy_value_t* output, void*); + + struct mmdeploy_sender; + struct mmdeploy_scheduler; + + typedef struct mmdeploy_sender* mmdeploy_sender_t; + typedef struct mmdeploy_scheduler* mmdeploy_scheduler_t; -struct mmdeploy_sender; -struct mmdeploy_scheduler; + typedef mmdeploy_sender_t (*mmdeploy_let_value_fn_t)(mmdeploy_value_t, void*); -typedef struct mmdeploy_sender* mmdeploy_sender_t; -typedef struct mmdeploy_scheduler* mmdeploy_scheduler_t; + /////////////////////////////////////////////////////////////////////////////// + // Scheduler + /////////////////////////////////////////////////////////////////////////////// + MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_inline(); -typedef mmdeploy_sender_t (*mmdeploy_let_value_fn_t)(mmdeploy_value_t, void*); + MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_system_pool(); -/////////////////////////////////////////////////////////////////////////////// -// Scheduler -/////////////////////////////////////////////////////////////////////////////// -MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_inline(); + /** + * Create a thread pool with the given number of worker threads + * @param[in] num_threads + * @return the handle to the created thread pool + */ + MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads); -MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_system_pool(); + MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread();
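To make the declarations that follow easier to read, here is a sketch of how a scheduler, a sender, and a consumer chain together. It is illustrative rather than part of the patch; the pass-through callback is hypothetical, and error handling is reduced to the return code.

#include "mmdeploy/executor.h"

/* pass-through callback for mmdeploy_executor_then; returns its input unchanged */
static mmdeploy_value_t passthrough(mmdeploy_value_t v, void* context)
{
    (void)context;
    return v;
}

int run_on_pool(mmdeploy_value_t input)
{
    mmdeploy_scheduler_t pool = mmdeploy_executor_system_pool();
    /* transfer_just -> then -> blocking wait */
    mmdeploy_sender_t sender = mmdeploy_executor_transfer_just(pool, input);
    sender = mmdeploy_executor_then(sender, passthrough, NULL);
    mmdeploy_value_t output = NULL;
    int ec = mmdeploy_executor_sync_wait_v2(sender, &output);
    if (ec == MMDEPLOY_SUCCESS)
    {
        mmdeploy_value_destroy(output);
    }
    /* the adapters consumed the sender; the scheduler handle is destroyed separately */
    mmdeploy_scheduler_destroy(pool);
    return ec;
}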
-/** - * Create a thread pool with the given number of worker threads - * @param[in] num_threads - * @return the handle to the created thread pool - */ -MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread_pool(int num_threads); + MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler, + int max_batch_size, + int timeout); -MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_create_thread(); + MMDEPLOY_API int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler); -MMDEPLOY_API mmdeploy_scheduler_t mmdeploy_executor_dynamic_batch(mmdeploy_scheduler_t scheduler, - int max_batch_size, int timeout); + /////////////////////////////////////////////////////////////////////////////// + // Utilities + /////////////////////////////////////////////////////////////////////////////// -MMDEPLOY_API int mmdeploy_scheduler_destroy(mmdeploy_scheduler_t scheduler); + /** + * @brief Create a copy of a copyable sender. Only senders created by \ref mmdeploy_executor_split + * are copyable for now. + * @param[in] input copyable sender + * @return the sender created, or nullptr if the sender is not copyable + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input); -/////////////////////////////////////////////////////////////////////////////// -// Utilities -/////////////////////////////////////////////////////////////////////////////// + /** + * @brief Destroy a sender. Note that all sender adapters consume input senders; only unused + * senders should be destroyed using this function. + * @param[in] sender + */ + MMDEPLOY_API int mmdeploy_sender_destroy(mmdeploy_sender_t sender); -/** - * @brief Create a copy of a copyable sender. Only senders created by \ref mmdeploy_executor_split - * is copyable for now. - * @param[in] input copyable sender, - * @return the sender created, or nullptr if the sender is not copyable - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_sender_copy(mmdeploy_sender_t input); + /////////////////////////////////////////////////////////////////////////////// + // Sender factories + /////////////////////////////////////////////////////////////////////////////// -/** - * @brief Destroy a sender, notice that all sender adapters will consume input senders, only unused - * senders should be destroyed using this function.
- * @param[in] input - */ -MMDEPLOY_API int mmdeploy_sender_destroy(mmdeploy_sender_t sender); + /** + * @brief Create a sender that sends the provided value + * @param[in] value + * @return created sender + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value); -/////////////////////////////////////////////////////////////////////////////// -// Sender factories -/////////////////////////////////////////////////////////////////////////////// + /** + * @brief Create a sender that starts on the provided scheduler and completes with an empty value + * @param[in] scheduler + * @return the sender created + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler); -/** - * @brief Create a sender that sends the provided value - * @param[in] value - * @return created sender - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_just(mmdeploy_value_t value); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, + mmdeploy_value_t value); -/** - * @brief - * @param[in] scheduler - * @return the sender created - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_schedule(mmdeploy_scheduler_t scheduler); + /////////////////////////////////////////////////////////////////////////////// + // Sender adapters + /////////////////////////////////////////////////////////////////////////////// -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer_just(mmdeploy_scheduler_t scheduler, - mmdeploy_value_t value); + /** + * Transfer the execution to the execution agent of the provided scheduler + * @param[in] input + * @param[in] scheduler + * @return the sender created + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, + mmdeploy_scheduler_t scheduler); -/////////////////////////////////////////////////////////////////////////////// -// Sender adapters -/////////////////////////////////////////////////////////////////////////////// + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, + mmdeploy_sender_t input); -/** - * Transfer the execution to the execution agent of the provided scheduler - * @param[in] input - * @param[in] scheduler - * @return the sender created - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_transfer(mmdeploy_sender_t input, - mmdeploy_scheduler_t scheduler); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, + mmdeploy_then_fn_t fn, + void* context); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_on(mmdeploy_scheduler_t scheduler, - mmdeploy_sender_t input); + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, + mmdeploy_let_value_fn_t fn, + void* context); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_then(mmdeploy_sender_t input, - mmdeploy_then_fn_t fn, void* context); - -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_let_value(mmdeploy_sender_t input, - mmdeploy_let_value_fn_t fn, - void* context); - -/** - * Convert the input sender into a sender that is copyable via \ref mmdeploy_sender_copy. Notice - * that this function doesn't make the sender multi-shot, it just return a sender that is copyable.
- * @param[in] input - * @return the sender that is copyable - */ -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n); -MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input); -/////////////////////////////////////////////////////////////////////////////// -// Sender consumers -/////////////////////////////////////////////////////////////////////////////// -MMDEPLOY_API int mmdeploy_executor_start_detached(mmdeploy_sender_t input); -MMDEPLOY_API mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input); -MMDEPLOY_API int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t input, mmdeploy_value_t* output); -MMDEPLOY_API void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), - void* context); + /** + * Convert the input sender into a sender that is copyable via \ref mmdeploy_sender_copy. Notice + * that this function doesn't make the sender multi-shot; it just returns a sender that is copyable. + * @param[in] input + * @return the sender that is copyable + */ + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_split(mmdeploy_sender_t input); + + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_when_all(mmdeploy_sender_t inputs[], int32_t n); + + MMDEPLOY_API mmdeploy_sender_t mmdeploy_executor_ensure_started(mmdeploy_sender_t input); + + /////////////////////////////////////////////////////////////////////////////// + // Sender consumers + /////////////////////////////////////////////////////////////////////////////// + MMDEPLOY_API int mmdeploy_executor_start_detached(mmdeploy_sender_t input); + + MMDEPLOY_API mmdeploy_value_t mmdeploy_executor_sync_wait(mmdeploy_sender_t input); + + MMDEPLOY_API int mmdeploy_executor_sync_wait_v2(mmdeploy_sender_t input, mmdeploy_value_t* output); + + MMDEPLOY_API void mmdeploy_executor_execute(mmdeploy_scheduler_t scheduler, void (*fn)(void*), void* context); #if __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h b/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h index 95f39fe009..0ae8c2a529 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/executor_internal.h @@ -8,33 +8,49 @@ using namespace mmdeploy; -using SenderType = TypeErasedSender<Value>; +using SenderType = TypeErasedSender<Value>; using SchedulerType = TypeErasedScheduler<Value>; -namespace { - -inline SchedulerType* Cast(mmdeploy_scheduler_t s) { return reinterpret_cast<SchedulerType*>(s); } - -inline mmdeploy_scheduler_t Cast(SchedulerType* s) { - return reinterpret_cast<mmdeploy_scheduler_t>(s); -} - -inline SenderType* Cast(mmdeploy_sender_t s) { return reinterpret_cast<SenderType*>(s); } - -inline mmdeploy_sender_t Cast(SenderType* s) { return reinterpret_cast<mmdeploy_sender_t>(s); } - -inline SenderType Take(mmdeploy_sender_t s) { - auto sender = std::move(*Cast(s)); - mmdeploy_sender_destroy(s); - return sender; -} - -inline mmdeploy_sender_t Take(SenderType s) { return Cast(new SenderType(std::move(s))); } - -template <class T, std::enable_if_t<!std::is_same_v<std::decay_t<T>, SenderType>, int> = 0> -inline mmdeploy_sender_t Take(T& s) { - return Take(SenderType(std::move(s))); -} +namespace +{ + + inline SchedulerType* Cast(mmdeploy_scheduler_t s) + { + return reinterpret_cast<SchedulerType*>(s); + } + + inline mmdeploy_scheduler_t Cast(SchedulerType* s) + { + return reinterpret_cast<mmdeploy_scheduler_t>(s); + } + + inline SenderType* Cast(mmdeploy_sender_t s) + { + return reinterpret_cast<SenderType*>(s); + } + + inline mmdeploy_sender_t Cast(SenderType* s) + { + return reinterpret_cast<mmdeploy_sender_t>(s); + } + + inline 
SenderType Take(mmdeploy_sender_t s) + { + auto sender = std::move(*Cast(s)); + mmdeploy_sender_destroy(s); + return sender; + } + + inline mmdeploy_sender_t Take(SenderType s) + { + return Cast(new SenderType(std::move(s))); + } + + template<class T, std::enable_if_t<!std::is_same_v<std::decay_t<T>, SenderType>, int> = 0> + inline mmdeploy_sender_t Take(T& s) + { + return Take(SenderType(std::move(s))); + } } // namespace diff --git a/csrc/mmdeploy/apis/c/mmdeploy/handle.h b/csrc/mmdeploy/apis/c/mmdeploy/handle.h index 006ddaae3d..d2ccde1ef5 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/handle.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/handle.h @@ -11,42 +11,53 @@ #include "mmdeploy/graph/common.h" #include "mmdeploy/graph/static_router.h" -namespace mmdeploy { - -using namespace framework; - -namespace { - -class AsyncHandle { - public: - AsyncHandle(const char* device_name, int device_id, Value config) - : AsyncHandle(SetContext(std::move(config), device_name, device_id)) {} - - explicit AsyncHandle(const Value& config) { - if (auto builder = graph::Builder::CreateFromConfig(config).value()) { - node_ = builder->Build().value(); - } else { - MMDEPLOY_ERROR("failed to find creator for node"); - throw_exception(eEntryNotFound); - } - } - - graph::Sender<Value> Process(graph::Sender<Value> input) { - return node_->Process(std::move(input)); - } - - private: - static Value SetContext(Value config, const char* device_name, int device_id) { - Device device(device_name, device_id); - Stream stream(device); - config["context"].update({{"device", device}, {"stream", stream}}); - return config; - } - - std::unique_ptr<graph::Node> node_; -}; - -} // namespace +namespace mmdeploy +{ + + using namespace framework; + + namespace + { + + class AsyncHandle + { + public: + AsyncHandle(const char* device_name, int device_id, Value config) + : AsyncHandle(SetContext(std::move(config), device_name, device_id)) + { + } + + explicit AsyncHandle(const Value& config) + { + if (auto builder = graph::Builder::CreateFromConfig(config).value()) + { + node_ = builder->Build().value(); + } + else + { + MMDEPLOY_ERROR("failed to find creator for node"); + throw_exception(eEntryNotFound); + } + } + + graph::Sender<Value> Process(graph::Sender<Value> input) + { + return node_->Process(std::move(input)); + } + + private: + static Value SetContext(Value config, const char* device_name, int device_id) + { + Device device(device_name, device_id); + Stream stream(device); + config["context"].update({{"device", device}, {"stream", stream}}); + return config; + } + + std::unique_ptr<graph::Node> node_; + }; + + } // namespace } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/c/mmdeploy/model.cpp b/csrc/mmdeploy/apis/c/mmdeploy/model.cpp index 6d202bce81..08af517522 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/model.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/model.cpp @@ -12,30 +12,45 @@ using namespace mmdeploy; -int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model) { - try { - auto ptr = std::make_unique<Model>(path); - *model = reinterpret_cast<mmdeploy_model_t>(ptr.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create model: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model) +{ + try + { + auto ptr = std::make_unique<Model>(path); + *model = reinterpret_cast<mmdeploy_model_t>(ptr.release()); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create model: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; }
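As a usage note (not part of the patch), both model constructors yield an opaque handle owned by the caller; a minimal sketch:

#include "mmdeploy/model.h"

/* load a model twice: from disk and from an in-memory buffer */
int load_models(const char* path, const void* buffer, int size)
{
    mmdeploy_model_t by_path = NULL;
    if (mmdeploy_model_create_by_path(path, &by_path) != MMDEPLOY_SUCCESS)
    {
        return MMDEPLOY_E_FAIL;
    }
    mmdeploy_model_t from_memory = NULL;
    int ec = mmdeploy_model_create(buffer, size, &from_memory);
    if (ec == MMDEPLOY_SUCCESS)
    {
        mmdeploy_model_destroy(from_memory);
    }
    mmdeploy_model_destroy(by_path);
    return ec;
}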
-int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model) { - try { - auto ptr = std::make_unique<Model>(buffer, size); - *model = reinterpret_cast<mmdeploy_model_t>(ptr.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("failed to create model: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model) +{ + try + { + auto ptr = std::make_unique<Model>(buffer, size); + *model = reinterpret_cast<mmdeploy_model_t>(ptr.release()); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("failed to create model: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_model_destroy(mmdeploy_model_t model) { delete reinterpret_cast<Model*>(model); } +void mmdeploy_model_destroy(mmdeploy_model_t model) +{ + delete reinterpret_cast<Model*>(model); +} diff --git a/csrc/mmdeploy/apis/c/mmdeploy/model.h b/csrc/mmdeploy/apis/c/mmdeploy/model.h index 394d2902c2..ddea967f1a 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/model.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/model.h @@ -11,34 +11,35 @@ #include "mmdeploy/common.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_model* mmdeploy_model_t; - -/** - * @brief Create SDK Model instance from given model path - * @param[in] path model path - * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model); - -/** - * @brief Create SDK Model instance from memory - * @param[in] buffer a linear buffer contains the model information - * @param[in] size size of \p buffer in bytes - * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model); - -/** - * @brief Destroy model instance - * @param[in] model sdk model instance created by \ref mmdeploy_model_create_by_path or \ref - * mmdeploy_model_create - */ -MMDEPLOY_API void mmdeploy_model_destroy(mmdeploy_model_t model); + typedef struct mmdeploy_model* mmdeploy_model_t; + + /** + * @brief Create SDK Model instance from given model path + * @param[in] path model path + * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_model_create_by_path(const char* path, mmdeploy_model_t* model); + + /** + * @brief Create SDK Model instance from memory + * @param[in] buffer a linear buffer containing the model information + * @param[in] size size of \p buffer in bytes + * @param[out] model sdk model instance that must be destroyed by \ref mmdeploy_model_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_model_create(const void* buffer, int size, mmdeploy_model_t* model); + + /** + * @brief Destroy model instance + * @param[in] model sdk model instance created by \ref mmdeploy_model_create_by_path or \ref + * mmdeploy_model_create + */ + MMDEPLOY_API void mmdeploy_model_destroy(mmdeploy_model_t model); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp 
b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp index a9a02807ee..9e0fcf011e 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.cpp @@ -6,73 +6,95 @@ #include "mmdeploy/executor_internal.h" #include "mmdeploy/handle.h" -int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline) { - try { - auto _config = *Cast(config); - if (context) { - if (!_config.contains("context")) { - _config["context"] = Value::Object(); - } - update(_config["context"].object(), Cast(context)->object(), 2); +int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline) +{ + try + { + auto _config = *Cast(config); + if (context) + { + if (!_config.contains("context")) + { + _config["context"] = Value::Object(); + } + update(_config["context"].object(), Cast(context)->object(), 2); + } + auto _handle = std::make_unique<AsyncHandle>(std::move(_config)); + *pipeline = Cast(_handle.release()); + return MMDEPLOY_SUCCESS; } - auto _handle = std::make_unique<AsyncHandle>(std::move(_config)); - *pipeline = Cast(_handle.release()); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline) { - auto config = Cast(model)->ReadConfig("pipeline.json"); - auto _context = *Cast(context); - _context["model"] = *Cast(model); - return mmdeploy_pipeline_create_v3(Cast(&config.value()), (mmdeploy_context_t)&_context, - pipeline); +int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline) +{ + auto config = Cast(model)->ReadConfig("pipeline.json"); + auto _context = *Cast(context); + _context["model"] = *Cast(model); + return mmdeploy_pipeline_create_v3(Cast(&config.value()), (mmdeploy_context_t)&_context, pipeline); } -int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - if (!pipeline || !input || !output) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - auto h = Cast(pipeline); - *output = Take(h->Process(Take(input))); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; +int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + if (!pipeline || !input || !output) + { + return MMDEPLOY_E_INVALID_ARG; + } + + try + { + auto h = Cast(pipeline); + *output = Take(h->Process(Take(input))); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + + return MMDEPLOY_E_FAIL; }
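For reference, the synchronous and asynchronous entry points above compose as follows; a minimal sketch assuming an already-created model, context, and packed input value:

#include "mmdeploy/pipeline.h"

/* build a pipeline from a model's pipeline.json and run it once */
int run_pipeline(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_value_t input)
{
    mmdeploy_pipeline_t pipeline = NULL;
    int ec = mmdeploy_pipeline_create_from_model(model, context, &pipeline);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }
    mmdeploy_value_t output = NULL;
    ec = mmdeploy_pipeline_apply(pipeline, input, &output);
    if (ec == MMDEPLOY_SUCCESS)
    {
        mmdeploy_value_destroy(output);
    }
    mmdeploy_pipeline_destroy(pipeline);
    return ec;
}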
-void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline) { - if (pipeline != nullptr) { - delete Cast(pipeline); - } +void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline) +{ + if (pipeline != nullptr) + { + delete Cast(pipeline); + } } -int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, - mmdeploy_value_t* output) { - auto input_sender = mmdeploy_executor_just(input); - if (!input_sender) { - return MMDEPLOY_E_FAIL; - } - mmdeploy_sender_t output_sender{}; - if (auto ec = mmdeploy_pipeline_apply_async(pipeline, input_sender, &output_sender)) { - return ec; - } - auto _output = mmdeploy_executor_sync_wait(output_sender); - if (!_output) { - return MMDEPLOY_E_FAIL; - } - *output = _output; - return MMDEPLOY_SUCCESS; +int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + auto input_sender = mmdeploy_executor_just(input); + if (!input_sender) + { + return MMDEPLOY_E_FAIL; + } + + mmdeploy_sender_t output_sender{}; + if (auto ec = mmdeploy_pipeline_apply_async(pipeline, input_sender, &output_sender)) + { + return ec; + } + + auto _output = mmdeploy_executor_sync_wait(output_sender); + if (!_output) + { + return MMDEPLOY_E_FAIL; + } + + *output = _output; + return MMDEPLOY_SUCCESS; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h index 55ccf1e67c..faf523863f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pipeline.h @@ -8,59 +8,59 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -/****************************************************************************** - * Experimental pipeline APIs */ + /****************************************************************************** + * Experimental pipeline APIs */ -typedef struct mmdeploy_pipeline* mmdeploy_pipeline_t; + typedef struct mmdeploy_pipeline* mmdeploy_pipeline_t; -/** - * Create pipeline - * @param config - * @param context - * @param pipeline - * @return - */ -MMDEPLOY_API int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline); -/** - * Create pipeline from internal pipeline config of the model - * @param model - * @param context - * @param pipeline - * @return - */ -MMDEPLOY_API int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_pipeline_t* pipeline); + /** + * Create pipeline + * @param config pipeline config + * @param context execution context + * @param pipeline handle of the created pipeline + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_create_v3(mmdeploy_value_t config, mmdeploy_context_t context, mmdeploy_pipeline_t* pipeline); + /** + * Create pipeline from internal pipeline config of the model + * @param model sdk model with a built-in pipeline config + * @param context execution context + * @param pipeline handle of the created pipeline + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_create_from_model(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_pipeline_t* pipeline); -/** - * @brief Apply pipeline - * @param[in] pipeline handle of the pipeline - * @param[in] input input value - * @param[out] output output value - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, - mmdeploy_value_t* output); + /** + * @brief Apply pipeline + * @param[in] pipeline handle of the pipeline + * @param[in] input input value + * @param[out] output output value + * 
@return status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_apply(mmdeploy_pipeline_t pipeline, mmdeploy_value_t input, mmdeploy_value_t* output); -/** - * Apply pipeline asynchronously - * @param pipeline handle of the pipeline - * @param input input sender that will be consumed by the operation - * @param output output sender - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, - mmdeploy_sender_t input, mmdeploy_sender_t* output); + /** + * Apply pipeline asynchronously + * @param pipeline handle of the pipeline + * @param input input sender that will be consumed by the operation + * @param output output sender + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pipeline_apply_async(mmdeploy_pipeline_t pipeline, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); -/** - * @brief destroy pipeline - * @param[in] pipeline - */ -MMDEPLOY_API void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline); + /** + * @brief destroy pipeline + * @param[in] pipeline + */ + MMDEPLOY_API void mmdeploy_pipeline_destroy(mmdeploy_pipeline_t pipeline); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp index 46f9921e62..ee0cc0c564 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.cpp @@ -16,164 +16,197 @@ using namespace std; using namespace mmdeploy; -int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_pose_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_pose_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_pose_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_pose_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_pose_detector_t* detector) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_pose_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector) +{ + mmdeploy_model_t model{}; + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_pose_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_pose_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_pose_detection_t** results) { - return mmdeploy_pose_detector_apply_bbox(detector, mats, mat_count, nullptr, nullptr, results); +int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_pose_detection_t** results) +{ + return mmdeploy_pose_detector_apply_bbox(detector, 
mats, mat_count, nullptr, nullptr, results); } -int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, const mmdeploy_rect_t* bboxes, - const int* bbox_count, mmdeploy_pose_detection_t** results) { - wrapped<mmdeploy_value_t> input; - if (auto ec = - mmdeploy_pose_detector_create_input(mats, mat_count, bboxes, bbox_count, input.ptr())) { - return ec; - } - wrapped<mmdeploy_value_t> output; - if (auto ec = mmdeploy_pose_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_pose_detector_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_pose_detection_t** results) +{ + wrapped<mmdeploy_value_t> input; + if (auto ec = + mmdeploy_pose_detector_create_input(mats, mat_count, bboxes, bbox_count, input.ptr())) + { + return ec; + } + wrapped<mmdeploy_value_t> output; + if (auto ec = mmdeploy_pose_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_pose_detector_get_result(output, results)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, int count) { - if (results == nullptr) { - return; - } - for (int i = 0; i < count; ++i) { - delete[] results[i].point; - delete[] results[i].score; - } - delete[] results; +void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, int count) +{ + if (results == nullptr) + { + return; + } + for (int i = 0; i < count; ++i) + { + delete[] results[i].point; + delete[] results[i].score; + } + delete[] results; } -void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); +void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); } -int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_pose_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_pose_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); }
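The bbox-based entry point above is typically fed boxes from a detector; the sketch below restricts pose estimation to one hand-written box and is illustrative only (the zero-filled frame is a stand-in for a real image):

#include <stdint.h>
#include "mmdeploy/pose_detector.h"

int run_pose(mmdeploy_pose_detector_t detector)
{
    static uint8_t pixels[256 * 256 * 3]; /* dummy BGR frame */
    mmdeploy_mat_t mat = {pixels, 256, 256, 3, MMDEPLOY_PIXEL_FORMAT_BGR, MMDEPLOY_DATA_TYPE_UINT8};
    mmdeploy_rect_t box = {16.f, 16.f, 240.f, 240.f}; /* left, top, right, bottom */
    int box_count = 1; /* one box for the single input image */
    mmdeploy_pose_detection_t* poses = NULL;
    int ec = mmdeploy_pose_detector_apply_bbox(detector, &mat, 1, &box, &box_count, &poses);
    if (ec == MMDEPLOY_SUCCESS)
    {
        /* one result per input box, each holding `length` keypoints */
        mmdeploy_pose_detector_release_result(poses, box_count);
    }
    return ec;
}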
-int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, const int* bbox_count, - mmdeploy_value_t* value) { - if (mat_count && mats == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value::Array input_images; - - auto add_bbox = [&](const Mat& img, const mmdeploy_rect_t* bbox) { - Value::Array b; - if (bbox) { - float width = bbox->right - bbox->left + 1; - float height = bbox->bottom - bbox->top + 1; - b = {bbox->left, bbox->top, width, height, 1.0}; - } else { - b = {0, 0, img.width(), img.height(), 1.0}; - } - input_images.push_back({{"ori_img", img}, {"bbox", std::move(b)}}); - }; - - for (int i = 0; i < mat_count; ++i) { - auto _mat = Cast(mats[i]); - if (bboxes && bbox_count) { - for (int j = 0; j < bbox_count[i]; ++j) { - add_bbox(_mat, bboxes++); - } - } else { // inference whole image - add_bbox(_mat, nullptr); - } } +int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_value_t* value) +{ + if (mat_count && mats == nullptr) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + Value::Array input_images; + + auto add_bbox = [&](const Mat& img, const mmdeploy_rect_t* bbox) + { + Value::Array b; + if (bbox) + { + float width = bbox->right - bbox->left + 1; + float height = bbox->bottom - bbox->top + 1; + b = {bbox->left, bbox->top, width, height, 1.0}; + } + else + { + b = {0, 0, img.width(), img.height(), 1.0}; + } + input_images.push_back({{"ori_img", img}, {"bbox", std::move(b)}}); + }; + + for (int i = 0; i < mat_count; ++i) + { + auto _mat = Cast(mats[i]); + if (bboxes && bbox_count) + { + for (int j = 0; j < bbox_count[i]; ++j) + { + add_bbox(_mat, bboxes++); + } + } + else + { // inference whole image + add_bbox(_mat, nullptr); + } + } - *value = Take(Value{std::move(input_images)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + *value = Take(Value{std::move(input_images)}); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, - mmdeploy_pose_detection_t** results) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - std::vector<mmpose::PoseDetectorOutput> detections; - from_value(Cast(output)->front(), detections); - - size_t count = detections.size(); - - auto deleter = [&](mmdeploy_pose_detection_t* p) { - mmdeploy_pose_detector_release_result(p, static_cast<int>(count)); - }; - - std::unique_ptr<mmdeploy_pose_detection_t[], decltype(deleter)> _results( - new mmdeploy_pose_detection_t[count]{}, deleter); - - size_t result_idx = 0; - for (const auto& bbox_result : detections) { - auto& res = _results[result_idx++]; - auto size = bbox_result.key_points.size(); - - res.point = new mmdeploy_point_t[size]; - res.score = new float[size]; - res.length = static_cast<int>(size); - - for (int k = 0; k < size; k++) { - res.point[k].x = bbox_result.key_points[k].bbox[0]; - res.point[k].y = bbox_result.key_points[k].bbox[1]; - res.score[k] = bbox_result.key_points[k].score; - } } +int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, + mmdeploy_pose_detection_t** results) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + std::vector<mmpose::PoseDetectorOutput> detections; + from_value(Cast(output)->front(), detections); + + size_t count = detections.size(); + + auto deleter = [&](mmdeploy_pose_detection_t* p) + { + mmdeploy_pose_detector_release_result(p, static_cast<int>(count)); + }; + + std::unique_ptr<mmdeploy_pose_detection_t[], decltype(deleter)> _results( + new 
mmdeploy_pose_detection_t[count]{}, + deleter); + + size_t result_idx = 0; + for (const auto& bbox_result : detections) + { + auto& res = _results[result_idx++]; + auto size = bbox_result.key_points.size(); + + res.point = new mmdeploy_point_t[size]; + res.score = new float[size]; + res.length = static_cast<int>(size); + + for (int k = 0; k < size; k++) + { + res.point[k].x = bbox_result.key_points[k].bbox[0]; + res.point[k].y = bbox_result.key_points[k].bbox[1]; + res.score[k] = bbox_result.key_points[k].score; + } + } - *results = _results.release(); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + *results = _results.release(); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h index ff0987cee4..6fceb99f72 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_detector.h @@ -13,111 +13,113 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_pose_detection_t { - mmdeploy_point_t* point; ///< keypoint - float* score; ///< keypoint score - int length; ///< number of keypoint -} mmdeploy_pose_detection_t; - -typedef struct mmdeploy_pose_detector* mmdeploy_pose_detector_t; - -/** - * @brief Create a pose detector instance - * @param[in] model an instance of mmpose model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. 
- * @param[out] detector handle of the created pose detector, which must be destroyed - * by \ref mmdeploy_pose_detector_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_pose_detector_t* detector); - -/** - * @brief Create a pose detector instance - * @param[in] model_path path to pose detection model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector handle of the created pose detector, which must be destroyed - * by \ref mmdeploy_pose_detector_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_pose_detector_t* detector); - -/** - * @brief Apply pose detector to a batch of images with full image roi - * @param[in] detector pose detector's handle created by \ref - * mmdeploy_pose_detector_create_by_path - * @param[in] images a batch of images - * @param[in] count number of images in the batch - * @param[out] results a linear buffer contains the pose result, must be release - * by \ref mmdeploy_pose_detector_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_pose_detection_t** results); - -/** - * @brief Apply pose detector to a batch of images supplied with bboxes(roi) - * @param[in] detector pose detector's handle created by \ref - * mmdeploy_pose_detector_create_by_path - * @param[in] images a batch of images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes(roi) detected by mmdet - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @param[out] results a linear buffer contains the pose result, which has the same length as \p - * bboxes, must be release by \ref mmdeploy_pose_detector_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, - const int* bbox_count, - mmdeploy_pose_detection_t** results); - -/** @brief Release result buffer returned by \ref mmdeploy_pose_detector_apply or \ref - * mmdeploy_pose_detector_apply_bbox - * @param[in] results result buffer by pose detector - * @param[in] count length of \p result - */ -MMDEPLOY_API void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, - int count); - -/** - * @brief destroy pose_detector - * @param[in] detector handle of pose_detector created by \ref - * mmdeploy_pose_detector_create_by_path or \ref mmdeploy_pose_detector_create - */ -MMDEPLOY_API void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -MMDEPLOY_API int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_pose_detector_t* detector); - -MMDEPLOY_API int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - const mmdeploy_rect_t* bboxes, - const int* bbox_count, - mmdeploy_value_t* value); - -MMDEPLOY_API int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, - mmdeploy_value_t input, mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, - mmdeploy_pose_detection_t** results); + typedef struct mmdeploy_pose_detection_t + { + mmdeploy_point_t* point; ///< keypoint + float* score; ///< keypoint score + int length; ///< number of keypoints + } mmdeploy_pose_detection_t; + + typedef struct mmdeploy_pose_detector* 
mmdeploy_pose_detector_t; + + /** + * @brief Create a pose detector instance + * @param[in] model an instance of mmpose model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] detector handle of the created pose detector, which must be destroyed + * by \ref mmdeploy_pose_detector_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_pose_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_pose_detector_t* detector); + + /** + * @brief Create a pose detector instance + * @param[in] model_path path to pose detection model + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] detector handle of the created pose detector, which must be destroyed + * by \ref mmdeploy_pose_detector_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_pose_detector_create_by_path(const char* model_path, + const char* device_name, + int device_id, + mmdeploy_pose_detector_t* detector); + + /** + * @brief Apply pose detector to a batch of images with full image roi + * @param[in] detector pose detector's handle created by \ref + * mmdeploy_pose_detector_create_by_path + * @param[in] images a batch of images + * @param[in] count number of images in the batch + * @param[out] results a linear buffer contains the pose result, must be release + * by \ref mmdeploy_pose_detector_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_pose_detector_apply(mmdeploy_pose_detector_t detector, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_pose_detection_t** results); + + /** + * @brief Apply pose detector to a batch of images supplied with bboxes(roi) + * @param[in] detector pose detector's handle created by \ref + * mmdeploy_pose_detector_create_by_path + * @param[in] images a batch of images + * @param[in] image_count number of images in the batch + * @param[in] bboxes bounding boxes(roi) detected by mmdet + * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images + * @param[out] results a linear buffer contains the pose result, which has the same length as \p + * bboxes, must be release by \ref mmdeploy_pose_detector_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_pose_detector_apply_bbox(mmdeploy_pose_detector_t detector, + const mmdeploy_mat_t* mats, + int mat_count, + const mmdeploy_rect_t* bboxes, + const int* bbox_count, + mmdeploy_pose_detection_t** results); + + /** @brief Release result buffer returned by \ref mmdeploy_pose_detector_apply or \ref + * mmdeploy_pose_detector_apply_bbox + * @param[in] results result buffer by pose detector + * @param[in] count length of \p result + */ + MMDEPLOY_API void mmdeploy_pose_detector_release_result(mmdeploy_pose_detection_t* results, + int count); + + /** + * @brief destroy pose_detector + * @param[in] detector handle of pose_detector created by \ref + * mmdeploy_pose_detector_create_by_path or \ref mmdeploy_pose_detector_create + */ + MMDEPLOY_API void mmdeploy_pose_detector_destroy(mmdeploy_pose_detector_t detector); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + MMDEPLOY_API int mmdeploy_pose_detector_create_v2(mmdeploy_model_t model, + 
mmdeploy_context_t context, + mmdeploy_pose_detector_t* detector); + + MMDEPLOY_API int mmdeploy_pose_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, const mmdeploy_rect_t* bboxes, const int* bbox_count, mmdeploy_value_t* value); + + MMDEPLOY_API int mmdeploy_pose_detector_apply_v2(mmdeploy_pose_detector_t detector, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + MMDEPLOY_API int mmdeploy_pose_detector_apply_async(mmdeploy_pose_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_pose_detector_get_result(mmdeploy_value_t output, + mmdeploy_pose_detection_t** results); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp index 113b520c39..d2587b1949 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.cpp @@ -9,18 +9,21 @@ #include "mmdeploy/core/mpl/structure.h" #include "mmdeploy/pipeline.h" -namespace mmdeploy { +namespace mmdeploy +{ -using namespace framework; + using namespace framework; } // namespace mmdeploy using namespace mmdeploy; -namespace { +namespace +{ -Value config_template() { - static const auto json = R"( + Value config_template() + { + static const auto json = R"( { "type": "Pipeline", "input": ["img", "force_det", "state"], @@ -77,149 +80,184 @@ Value config_template() { ] } )"_json; - static const auto config = from_json(json); - return config; -} + static const auto config = from_json(json); + return config; + } } // namespace -int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params) { - mmpose::_pose_tracker::SetDefaultParams(*params); - return 0; +int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params) +{ + mmpose::_pose_tracker::SetDefaultParams(*params); + return 0; } -int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, mmdeploy_model_t pose_model, - mmdeploy_context_t context, mmdeploy_pose_tracker_t* pipeline) { - mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "detection", det_model); - mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "pose", pose_model); - auto config = config_template(); - return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)pipeline); +int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, mmdeploy_model_t pose_model, mmdeploy_context_t context, mmdeploy_pose_tracker_t* pipeline) +{ + mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "detection", det_model); + mmdeploy_context_add(context, MMDEPLOY_TYPE_MODEL, "pose", pose_model); + auto config = config_template(); + return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)pipeline); } -void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)pipeline); +void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)pipeline); } -int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, +int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params, - mmdeploy_pose_tracker_state_t* state) { - try { - auto create_fn = gRegistry<Module>().Create("pose_tracker::Create", Value()).value(); - *state = reinterpret_cast<mmdeploy_pose_tracker_state_t>(new Value( - create_fn->Process({const_cast<mmdeploy_pose_tracker_param_t*>(params)}).value()[0])); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + mmdeploy_pose_tracker_state_t* state) +{ + try + { + auto create_fn = gRegistry<Module>().Create("pose_tracker::Create", Value()).value(); + *state = reinterpret_cast<mmdeploy_pose_tracker_state_t>(new Value( + create_fn->Process({const_cast<mmdeploy_pose_tracker_param_t*>(params)}).value()[0])); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state) { - delete reinterpret_cast<Value*>(state); +void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state) +{ + delete reinterpret_cast<Value*>(state); } int mmdeploy_pose_tracker_create_input(mmdeploy_pose_tracker_state_t* states, - const mmdeploy_mat_t* frames, const int32_t* use_detect, - int batch_size, mmdeploy_value_t* value) { - try { - Value::Array images; - Value::Array use_dets; - Value::Array trackers; - for (int i = 0; i < batch_size; ++i) { - images.push_back({{"ori_img", Cast(frames[i])}}); - use_dets.emplace_back(use_detect ? use_detect[i] : -1); - trackers.push_back(*reinterpret_cast<Value*>(states[i])); + const mmdeploy_mat_t* frames, + const int32_t* use_detect, + int batch_size, + mmdeploy_value_t* value) +{ + try + { + Value::Array images; + Value::Array use_dets; + Value::Array trackers; + for (int i = 0; i < batch_size; ++i) + { + images.push_back({{"ori_img", Cast(frames[i])}}); + use_dets.emplace_back(use_detect ? use_detect[i] : -1); + trackers.push_back(*reinterpret_cast<Value*>(states[i])); + } + *value = Take(Value{std::move(images), std::move(use_dets), std::move(trackers)}); + return MMDEPLOY_SUCCESS; } - *value = Take(Value{std::move(images), std::move(use_dets), std::move(trackers)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -using ResultType = mmdeploy::Structure<mmdeploy_pose_tracker_target_t, std::vector<int32_t>, - std::vector<mmpose::_pose_tracker::TrackerResult>>; +using ResultType = mmdeploy::Structure<mmdeploy_pose_tracker_target_t, std::vector<int32_t>, std::vector<mmpose::_pose_tracker::TrackerResult>>; -int mmdeploy_pose_tracker_get_result(mmdeploy_value_t output, +int mmdeploy_pose_tracker_get_result(mmdeploy_value_t output, mmdeploy_pose_tracker_target_t** results, - int32_t** result_count) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - // convert result from Values - std::vector<mmpose::_pose_tracker::TrackerResult> res; - from_value(Cast(output)->front(), res); - - size_t total = 0; - for (const auto& r : res) { - total += r.bboxes.size(); + int32_t** result_count) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + // convert result from Values + std::vector<mmpose::_pose_tracker::TrackerResult> res; + from_value(Cast(output)->front(), res); - // preserve space for the output structure - ResultType result_type({total, 1, 1}); - auto [result_data, result_cnt, result_holder] = result_type.pointers(); + size_t total = 0; + for (const auto& r : res) + { + total += r.bboxes.size(); + } - auto result_ptr = result_data; + // preserve space for the output structure + ResultType result_type({total, 1, 1}); + auto [result_data, result_cnt, result_holder] = result_type.pointers(); - result_holder->swap(res); + auto result_ptr = result_data; - // build output structure - for (auto& r : *result_holder) { - for (int j = 0; j < r.bboxes.size(); ++j) { - auto& p = *result_ptr++; - p.keypoint_count = static_cast<int32_t>(r.keypoints[j].size()); - p.keypoints = r.keypoints[j].data(); - p.scores = r.scores[j].data(); - p.bbox = r.bboxes[j]; - p.target_id = r.track_ids[j]; - } - result_cnt->push_back(r.bboxes.size()); - // debug info - // p.reserved0 = new std::vector(r.pose_input_bboxes); - // p.reserved1 = new std::vector(r.pose_output_bboxes); - } + result_holder->swap(res); - *results = result_data; - *result_count = result_cnt->data(); - result_type.release(); + // build output structure + for (auto& r : *result_holder) + { + for (int j = 0; j < r.bboxes.size(); ++j) + { + auto& p = *result_ptr++; + p.keypoint_count = static_cast<int32_t>(r.keypoints[j].size()); + p.keypoints = r.keypoints[j].data(); + p.scores = r.scores[j].data(); + p.bbox = r.bboxes[j]; + p.target_id = r.track_ids[j]; + } + result_cnt->push_back(r.bboxes.size()); + // debug info + // p.reserved0 = new std::vector(r.pose_input_bboxes); + // p.reserved1 = new std::vector(r.pose_output_bboxes); + } - return MMDEPLOY_SUCCESS; + *results = result_data; + *result_count = result_cnt->data(); + result_type.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, - mmdeploy_pose_tracker_state_t* states, const mmdeploy_mat_t* frames, - const int32_t* use_detect, int32_t count, - mmdeploy_pose_tracker_target_t** results, int32_t** result_count) { - wrapped<mmdeploy_value_t> input; - if (auto ec = - mmdeploy_pose_tracker_create_input(states, frames, use_detect, count, input.ptr())) { - return ec; - } - wrapped<mmdeploy_value_t> output; - if (auto ec = mmdeploy_pipeline_apply((mmdeploy_pipeline_t)pipeline, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_pose_tracker_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, + mmdeploy_pose_tracker_state_t* states, + const mmdeploy_mat_t* frames, + const int32_t* use_detect, + int32_t count, + mmdeploy_pose_tracker_target_t** results, + int32_t** result_count) +{ + wrapped<mmdeploy_value_t> input; + if (auto ec = + mmdeploy_pose_tracker_create_input(states, frames, use_detect, count, input.ptr())) + { + return ec; + } + wrapped<mmdeploy_value_t> output; + if (auto ec = mmdeploy_pipeline_apply((mmdeploy_pipeline_t)pipeline, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_pose_tracker_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results, - const int32_t* result_count, int count) { - auto total = std::accumulate(result_count, result_count + count, 0); - ResultType deleter({static_cast<size_t>(total), 1, 1}, results); + const int32_t* result_count, + int count) +{ + auto total = std::accumulate(result_count, result_count + count, 0); + ResultType deleter({static_cast<size_t>(total), 1, 1}, results); }
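For reference, the pose tracker C API reformatted above (and declared in the header that follows) is driven in a fixed order: fill defaults, create the pipeline, create one state per video stream, then apply per frame and release the results. A minimal sketch follows; it is illustration only, not part of the patch, and it assumes a valid context, both model handles, and a single frame already wrapped in an mmdeploy_mat_t (cleanup on early-error paths is elided):

#include "mmdeploy/pose_tracker.h"

int track_one_frame(mmdeploy_model_t det_model, mmdeploy_model_t pose_model, mmdeploy_context_t context, const mmdeploy_mat_t* frame)
{
    mmdeploy_pose_tracker_t tracker{};
    if (mmdeploy_pose_tracker_create(det_model, pose_model, context, &tracker)) return -1;

    mmdeploy_pose_tracker_param_t params{};
    mmdeploy_pose_tracker_default_params(&params);  // fill defaults first, then override selectively
    params.det_interval = 5;                        // hypothetical choice: run the detector every 5th frame

    mmdeploy_pose_tracker_state_t state{};          // one state per video stream
    if (mmdeploy_pose_tracker_create_state(tracker, &params, &state)) return -1;

    int32_t use_detect = -1;  // -1: let det_interval decide, per the header docs
    mmdeploy_pose_tracker_target_t* targets{};
    int32_t* target_count{};
    // batch of one frame, so all arrays have size 1
    if (mmdeploy_pose_tracker_apply(tracker, &state, frame, &use_detect, 1, &targets, &target_count)) return -1;

    // consume targets[0 .. target_count[0]): keypoints, scores, bbox, target_id

    mmdeploy_pose_tracker_release_result(targets, target_count, 1);
    mmdeploy_pose_tracker_destroy_state(state);
    mmdeploy_pose_tracker_destroy(tracker);
    return 0;
}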
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h index 4b27fbab8a..c8191b40fa 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/pose_tracker.h @@ -14,142 +14,147 @@ #include "mmdeploy/pose_detector.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_pose_tracker* mmdeploy_pose_tracker_t; -typedef struct mmdeploy_pose_tracker_state* mmdeploy_pose_tracker_state_t; - -typedef struct mmdeploy_pose_tracker_param_t { - // detection interval, default = 1 - int32_t det_interval; - // detection label use for pose estimation, default = 0 - int32_t det_label; - // detection score threshold, default = 0.5 - float det_thr; - // detection minimum bbox size (compute as sqrt(area)), default = -1 - float det_min_bbox_size; - // nms iou threshold for merging detected bboxes and bboxes from tracked targets, default = 0.7 - float det_nms_thr; - - // max number of bboxes used for pose estimation per frame, default = -1 - int32_t pose_max_num_bboxes; - // threshold for visible key-points, default = 0.5 - float pose_kpt_thr; - // min number of key-points for valid poses (-1 indicates ceil(n_kpts/2)), default = -1 - int32_t pose_min_keypoints; - // scale for expanding key-points to bbox, default = 1.25 - float pose_bbox_scale; - // min pose bbox size, tracks with bbox size smaller than the threshold will be dropped, - // default = -1 - float pose_min_bbox_size; - // nms oks/iou threshold for suppressing overlapped poses, useful when multiple pose estimations - // collapse to the same target, default = 0.5 - float pose_nms_thr; - // keypoint sigmas for computing OKS, will use IOU if not set, default = nullptr - float* keypoint_sigmas; - // size of keypoint sigma array, must be consistent with the number of key-points, default = 0 - int32_t keypoint_sigmas_size; - - // iou threshold for associating missing tracks, default = 0.4 - float track_iou_thr; - // max number of missing frames before a missing tracks is removed, default = 10 - int32_t track_max_missing; - // track history size, default = 1 - int32_t track_history_size; - - // weight of position for setting covariance matrices of kalman filters, default = 0.05 - float std_weight_position; - // weight of velocity for setting covariance matrices of kalman filters, default = 0.00625 - float std_weight_velocity; - - // params for the one-euro filter for smoothing the outputs - (beta, fc_min, fc_derivative) - // default = (0.007, 1, 1) - float smooth_params[3]; -} mmdeploy_pose_tracker_param_t; - -typedef struct mmdeploy_pose_tracker_target_t { - mmdeploy_point_t* keypoints; // key-points of the target - int32_t keypoint_count; // size of `keypoints` array - float* scores; // scores of each key-point - mmdeploy_rect_t bbox; // estimated bbox from key-points - uint32_t target_id; // target id from internal tracker -} mmdeploy_pose_tracker_target_t; - -/** - * @brief Fill params with default parameters - * @param[in,out] params - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params); - -/** - * @brief Create pose tracker pipeline - * @param[in] det_model detection model object, created by \ref mmdeploy_model_create - * @param[in] pose_model pose model object - * @param[in] context context object describing execution environment (device, profiler, etc...), - * created by \ref mmdeploy_context_create - * @param[out] pipeline handle of the created pipeline - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, - mmdeploy_model_t pose_model, - mmdeploy_context_t context, - mmdeploy_pose_tracker_t* pipeline); - -/** - * @brief Destroy pose tracker pipeline - * @param[in] pipeline - */ -MMDEPLOY_API void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline); - -/** - * @brief Create a tracker state handle corresponds to a video stream - * @param[in] pipeline handle of a pose tracker pipeline - * @param[in] params params for creating the tracker state - * @param[out] state handle of the created tracker state - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, - const mmdeploy_pose_tracker_param_t* params, - mmdeploy_pose_tracker_state_t* state); - -/** - * @brief Destroy tracker state - * @param[in] state handle of the tracker state - */ -MMDEPLOY_API void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state); - -/** - * @brief Apply pose tracker pipeline, notice that this function supports batch operation by feeding - * arrays of size \p count to \p states, \p frames and \p use_detect - * @param[in] pipeline handle of a pose tracker pipeline - * @param[in] states tracker states handles, array of size \p count - * @param[in] frames input frames of size \p count - * @param[in] use_detect control the use of detector, array of size \p count - * -1: use params.det_interval, 0: don't use detector, 1: force use detector - * @param[in] count batch size - * @param[out] results a linear buffer contains the tracked targets of input frames.
Should be - * released by \ref mmdeploy_pose_tracker_release_result - * @param[out] result_count a linear buffer of size \p count contains the number of tracked - * targets of the frames. Should be released by \ref mmdeploy_pose_tracker_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, - mmdeploy_pose_tracker_state_t* states, - const mmdeploy_mat_t* frames, - const int32_t* use_detect, int32_t count, - mmdeploy_pose_tracker_target_t** results, - int32_t** result_count); - -/** - * @brief Release result objects - * @param[in] results - * @param[in] result_count - * @param[in] count - */ -MMDEPLOY_API void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results, - const int32_t* result_count, int count); + typedef struct mmdeploy_pose_tracker* mmdeploy_pose_tracker_t; + typedef struct mmdeploy_pose_tracker_state* mmdeploy_pose_tracker_state_t; + + typedef struct mmdeploy_pose_tracker_param_t + { + // detection interval, default = 1 + int32_t det_interval; + // detection label use for pose estimation, default = 0 + int32_t det_label; + // detection score threshold, default = 0.5 + float det_thr; + // detection minimum bbox size (compute as sqrt(area)), default = -1 + float det_min_bbox_size; + // nms iou threshold for merging detected bboxes and bboxes from tracked targets, default = 0.7 + float det_nms_thr; + + // max number of bboxes used for pose estimation per frame, default = -1 + int32_t pose_max_num_bboxes; + // threshold for visible key-points, default = 0.5 + float pose_kpt_thr; + // min number of key-points for valid poses (-1 indicates ceil(n_kpts/2)), default = -1 + int32_t pose_min_keypoints; + // scale for expanding key-points to bbox, default = 1.25 + float pose_bbox_scale; + // min pose bbox size, tracks with bbox size smaller than the threshold will be dropped, + // default = -1 + float pose_min_bbox_size; + // nms oks/iou threshold for suppressing overlapped poses, useful when multiple pose estimations + // collapse to the same target, default = 0.5 + float pose_nms_thr; + // keypoint sigmas for computing OKS, will use IOU if not set, default = nullptr + float* keypoint_sigmas; + // size of keypoint sigma array, must be consistent with the number of key-points, default = 0 + int32_t keypoint_sigmas_size; + + // iou threshold for associating missing tracks, default = 0.4 + float track_iou_thr; + // max number of missing frames before a missing tracks is removed, default = 10 + int32_t track_max_missing; + // track history size, default = 1 + int32_t track_history_size; + + // weight of position for setting covariance matrices of kalman filters, default = 0.05 + float std_weight_position; + // weight of velocity for setting covariance matrices of kalman filters, default = 0.00625 + float std_weight_velocity; + + // params for the one-euro filter for smoothing the outputs - (beta, fc_min, fc_derivative) + // default = (0.007, 1, 1) + float smooth_params[3]; + } mmdeploy_pose_tracker_param_t; + + typedef struct mmdeploy_pose_tracker_target_t + { + mmdeploy_point_t* keypoints; // key-points of the target + int32_t keypoint_count; // size of `keypoints` array + float* scores; // scores of each key-point + mmdeploy_rect_t bbox; // estimated bbox from key-points + uint32_t target_id; // target id from internal tracker + } mmdeploy_pose_tracker_target_t; + + /** + * @brief Fill params with default parameters + * @param[in,out] params + * @return status of the operation + */ + 
MMDEPLOY_API int mmdeploy_pose_tracker_default_params(mmdeploy_pose_tracker_param_t* params); + + /** + * @brief Create pose tracker pipeline + * @param[in] det_model detection model object, created by \ref mmdeploy_model_create + * @param[in] pose_model pose model object + * @param[in] context context object describing execution environment (device, profiler, etc...), + * created by \ref mmdeploy_context_create + * @param[out] pipeline handle of the created pipeline + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pose_tracker_create(mmdeploy_model_t det_model, + mmdeploy_model_t pose_model, + mmdeploy_context_t context, + mmdeploy_pose_tracker_t* pipeline); + + /** + * @brief Destroy pose tracker pipeline + * @param[in] pipeline + */ + MMDEPLOY_API void mmdeploy_pose_tracker_destroy(mmdeploy_pose_tracker_t pipeline); + + /** + * @brief Create a tracker state handle corresponds to a video stream + * @param[in] pipeline handle of a pose tracker pipeline + * @param[in] params params for creating the tracker state + * @param[out] state handle of the created tracker state + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pose_tracker_create_state(mmdeploy_pose_tracker_t pipeline, + const mmdeploy_pose_tracker_param_t* params, + mmdeploy_pose_tracker_state_t* state); + + /** + * @brief Destroy tracker state + * @param[in] state handle of the tracker state + */ + MMDEPLOY_API void mmdeploy_pose_tracker_destroy_state(mmdeploy_pose_tracker_state_t state); + + /** + * @brief Apply pose tracker pipeline, notice that this function supports batch operation by feeding + * arrays of size \p count to \p states, \p frames and \p use_detect + * @param[in] pipeline handle of a pose tracker pipeline + * @param[in] states tracker states handles, array of size \p count + * @param[in] frames input frames of size \p count + * @param[in] use_detect control the use of detector, array of size \p count + * -1: use params.det_interval, 0: don't use detector, 1: force use detector + * @param[in] count batch size + * @param[out] results a linear buffer contains the tracked targets of input frames. Should be + * released by \ref mmdeploy_pose_tracker_release_result + * @param[out] result_count a linear buffer of size \p count contains the number of tracked + * targets of the frames. 
+ Should be released by \ref mmdeploy_pose_tracker_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_pose_tracker_apply(mmdeploy_pose_tracker_t pipeline, + mmdeploy_pose_tracker_state_t* states, + const mmdeploy_mat_t* frames, + const int32_t* use_detect, + int32_t count, + mmdeploy_pose_tracker_target_t** results, + int32_t** result_count); + + /** + * @brief Release result objects + * @param[in] results + * @param[in] result_count + * @param[in] count + */ + MMDEPLOY_API void mmdeploy_pose_tracker_release_result(mmdeploy_pose_tracker_target_t* results, + const int32_t* result_count, + int count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp index 9ca2ca65f7..49f8487d12 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/restorer.cpp @@ -16,106 +16,121 @@ using namespace mmdeploy; using ResultType = mmdeploy::Structure<mmdeploy_mat_t, framework::Buffer>; -int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_restorer_t* restorer) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_restorer_t* restorer) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_restorer_create_v2(model, context, restorer); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_restorer_create_v2(model, context, restorer); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id, - mmdeploy_restorer_t* restorer) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_restorer_t* restorer) +{ + mmdeploy_model_t model{}; + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_restorer_create(model, device_name, device_id, restorer); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_restorer_create(model, device_name, device_id, restorer); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count, - mmdeploy_mat_t** results) { - wrapped<mmdeploy_value_t> input; - if (auto ec = mmdeploy_restorer_create_input(images, count, input.ptr())) { - return ec; - } - wrapped<mmdeploy_value_t> output; - if (auto ec = mmdeploy_restorer_apply_v2(restorer, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_restorer_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count, mmdeploy_mat_t** results) +{ + wrapped<mmdeploy_value_t> input; + if (auto ec = mmdeploy_restorer_create_input(images, count, input.ptr())) + { + return ec; + } + wrapped<mmdeploy_value_t> output; + if (auto ec = mmdeploy_restorer_apply_v2(restorer, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_restorer_get_result(output, results)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count) { - ResultType deleter{static_cast<size_t>(count), results}; +void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count) +{ + ResultType deleter{static_cast<size_t>(count), results}; } -void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)restorer); +void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)restorer); } -int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_restorer_t* restorer) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)restorer); +int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_restorer_t* restorer) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)restorer); } -int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value) { - return mmdeploy_common_create_input(mats, mat_count, value); +int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value) +{ + return mmdeploy_common_create_input(mats, mat_count, value); } -int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)restorer, input, output); +int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)restorer, input, output); } -int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)restorer, input, output); +int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)restorer, input, output); } -int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - const Value& value = Cast(output)->front(); - - auto restorer_output = from_value<std::vector<framework::Mat>>(value); - auto count = restorer_output.size(); - - ResultType r(count); - auto [_results, buffers] = r.pointers(); - - for (int i = 0; i < count; ++i) { - auto upscale = restorer_output[i]; - auto& res = _results[i]; - res.data = upscale.data(); - buffers[i] = upscale.buffer(); - res.format = (mmdeploy_pixel_format_t)upscale.pixel_format(); - res.height = upscale.height(); - res.width = upscale.width(); - res.channel = upscale.channel(); - res.type = (mmdeploy_data_type_t)upscale.type(); +int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; } - - *results = _results; - r.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + try + { + const Value& value = Cast(output)->front(); + + auto restorer_output = from_value<std::vector<framework::Mat>>(value); + auto count = restorer_output.size(); + + ResultType r(count); + auto [_results, buffers] = r.pointers(); + + for (int i = 0; i < count; ++i) + { + auto upscale = restorer_output[i]; + auto& res = _results[i]; + res.data = upscale.data(); + buffers[i] = upscale.buffer(); + res.format = (mmdeploy_pixel_format_t)upscale.pixel_format(); + res.height = upscale.height(); + res.width = upscale.width(); + res.channel = upscale.channel(); + res.type = (mmdeploy_data_type_t)upscale.type(); + } + + *results = _results; + r.release(); + + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; }
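The restorer's synchronous path follows the same create/apply/release/destroy shape as the other tasks in this patch. A short sketch, again illustration only and not part of the patch; the model path and device name are placeholders:

#include "mmdeploy/restorer.h"

int restore_batch(const mmdeploy_mat_t* images, int count)
{
    mmdeploy_restorer_t restorer{};
    // "../sr_model" and "cpu" are placeholder arguments
    if (mmdeploy_restorer_create_by_path("../sr_model", "cpu", 0, &restorer)) return -1;

    mmdeploy_mat_t* restored{};
    if (mmdeploy_restorer_apply(restorer, images, count, &restored))
    {
        mmdeploy_restorer_destroy(restorer);
        return -1;
    }

    // restored[i] describes the i-th output image (data, width, height, channel, format, type)

    mmdeploy_restorer_release_result(restored, count);
    mmdeploy_restorer_destroy(restorer);
    return 0;
}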
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/restorer.h b/csrc/mmdeploy/apis/c/mmdeploy/restorer.h index 9ab529850f..5c8533102f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/restorer.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/restorer.h @@ -13,76 +13,72 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_restorer* mmdeploy_restorer_t; - -/** - * @brief Create a restorer instance - * @param[in] model an instance of image restoration model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] restorer handle of the created restorer, which must be destroyed - * by \ref mmdeploy_restorer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_restorer_t* restorer); - -/** - * @brief Create a restorer instance - * @param[in] model_path path to image restoration model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] restorer handle of the created restorer, which must be destroyed - * by \ref mmdeploy_restorer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_restorer_t* restorer); - -/** - * @brief Apply restorer to a batch of images - * @param[in] restorer restorer's handle created by \ref mmdeploy_restorer_create_by_path - * @param[in] images a batch of images - * @param[in] count number of images in the batch - * @param[out] results a linear buffer contains the restored images, must be release - * by \ref mmdeploy_restorer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, - int count, mmdeploy_mat_t** results); - -/** @brief Release result buffer returned by \ref mmdeploy_restorer_apply - * @param[in] results result buffer by restorer - * @param[in] count length of \p result - */ -MMDEPLOY_API void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count); - -/** - * @brief destroy restorer - * @param[in] restorer handle of restorer created by \ref mmdeploy_restorer_create_by_path - */ -MMDEPLOY_API void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -MMDEPLOY_API int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_restorer_t* restorer); - -MMDEPLOY_API int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); - -MMDEPLOY_API int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, - mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, - mmdeploy_sender_t input, mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results); + typedef struct mmdeploy_restorer* mmdeploy_restorer_t; + + /** + * @brief Create a restorer instance + * @param[in] model an instance of image restoration model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] restorer handle of the created restorer, which must be destroyed + * by \ref mmdeploy_restorer_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_restorer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_restorer_t* restorer); + + /** + * @brief Create a restorer instance + * @param[in] model_path path to image restoration model + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device.
+ * @param[out] restorer handle of the created restorer, which must be destroyed + * by \ref mmdeploy_restorer_destroy + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_restorer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_restorer_t* restorer); + + /** + * @brief Apply restorer to a batch of images + * @param[in] restorer restorer's handle created by \ref mmdeploy_restorer_create_by_path + * @param[in] images a batch of images + * @param[in] count number of images in the batch + * @param[out] results a linear buffer contains the restored images, must be release + * by \ref mmdeploy_restorer_release_result + * @return status code of the operation + */ + MMDEPLOY_API int mmdeploy_restorer_apply(mmdeploy_restorer_t restorer, const mmdeploy_mat_t* images, int count, mmdeploy_mat_t** results); + + /** @brief Release result buffer returned by \ref mmdeploy_restorer_apply + * @param[in] results result buffer by restorer + * @param[in] count length of \p result + */ + MMDEPLOY_API void mmdeploy_restorer_release_result(mmdeploy_mat_t* results, int count); + + /** + * @brief destroy restorer + * @param[in] restorer handle of restorer created by \ref mmdeploy_restorer_create_by_path + */ + MMDEPLOY_API void mmdeploy_restorer_destroy(mmdeploy_restorer_t restorer); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + MMDEPLOY_API int mmdeploy_restorer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_restorer_t* restorer); + + MMDEPLOY_API int mmdeploy_restorer_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value); + + MMDEPLOY_API int mmdeploy_restorer_apply_v2(mmdeploy_restorer_t restorer, mmdeploy_value_t input, mmdeploy_value_t* output); + + MMDEPLOY_API int mmdeploy_restorer_apply_async(mmdeploy_restorer_t restorer, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_restorer_get_result(mmdeploy_value_t output, mmdeploy_mat_t** results); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp index d2172c54b8..04d537a376 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.cpp @@ -15,124 +15,146 @@ using namespace std; using namespace mmdeploy; -int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_rotated_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_rotated_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_rotated_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_rotated_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_rotated_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_rotated_detector_t* detector) { - mmdeploy_model_t model{}; +int mmdeploy_rotated_detector_create_by_path(const char* model_path, const char* 
device_name, int device_id, mmdeploy_rotated_detector_t* detector) +{ + mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_rotated_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_rotated_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_rotated_detection_t** results, int** result_count) { - wrapped<mmdeploy_value_t> input; - if (auto ec = mmdeploy_rotated_detector_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped<mmdeploy_value_t> output; - if (auto ec = mmdeploy_rotated_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_rotated_detector_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_rotated_detection_t** results, + int** result_count) +{ + wrapped<mmdeploy_value_t> input; + if (auto ec = mmdeploy_rotated_detector_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped<mmdeploy_value_t> output; + if (auto ec = mmdeploy_rotated_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_rotated_detector_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results, - const int* result_count) { - delete[] results; - delete[] result_count; + const int* result_count) +{ + delete[] results; + delete[] result_count; } -void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); +void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector); } -int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_rotated_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_rotated_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); } -int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input) { - return mmdeploy_common_create_input(mats, mat_count, input); +int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input) +{ + return mmdeploy_common_create_input(mats, mat_count, input); } -int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector, - mmdeploy_sender_t input, mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); + mmdeploy_sender_t input, + mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output, +int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output, mmdeploy_rotated_detection_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - - try { - Value& value = Cast(output)->front(); - auto detector_outputs = from_value<vector<mmrotate::RotatedDetectorOutput>>(value); - - vector<int> _result_count; - _result_count.reserve(detector_outputs.size()); - for (const auto& det_output : detector_outputs) { - _result_count.push_back((int)det_output.detections.size()); + int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0); + try + { + Value& value = Cast(output)->front(); + auto detector_outputs = from_value<vector<mmrotate::RotatedDetectorOutput>>(value); - std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{}); - std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); - - std::unique_ptr<mmdeploy_rotated_detection_t[]> result_data( - new mmdeploy_rotated_detection_t[total]{}); - auto result_ptr = result_data.get(); - - for (const auto& det_output : detector_outputs) { - for (const auto& detection : det_output.detections) { - result_ptr->label_id = detection.label_id; - result_ptr->score = detection.score; - const auto& rbbox = detection.rbbox; - for (int i = 0; i < 5; i++) { - result_ptr->rbbox[i] = rbbox[i]; + vector<int> _result_count; + _result_count.reserve(detector_outputs.size()); + for (const auto& det_output : detector_outputs) + { + _result_count.push_back((int)det_output.detections.size()); } - ++result_ptr; - } - } - *result_count = result_count_data.release(); - *results = result_data.release(); + auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0); + + std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{}); + std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); + + std::unique_ptr<mmdeploy_rotated_detection_t[]> result_data( + new mmdeploy_rotated_detection_t[total]{}); + auto result_ptr = result_data.get(); + + for (const auto& det_output : detector_outputs) + { + for (const auto& detection : det_output.detections) + { + result_ptr->label_id = detection.label_id; + result_ptr->score = detection.score; + const auto& rbbox = detection.rbbox; + for (int i = 0; i < 5; i++) + { + result_ptr->rbbox[i] = rbbox[i]; + } + ++result_ptr; + } + } - return MMDEPLOY_SUCCESS; + *result_count = result_count_data.release(); + *results = result_data.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; }
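Before the matching header below: note that mmdeploy_rotated_detector_get_result packs the detections of all images into one linear buffer, with result_count[i] giving the per-image counts, and that each rbbox is laid out as cx, cy, w, h, angle (per the struct comment). A small sketch of consuming that layout, illustration only; the detector handle and input mats are assumed to exist:

#include "mmdeploy/rotated_detector.h"

#include <cstdio>

int print_rotated_detections(mmdeploy_rotated_detector_t detector, const mmdeploy_mat_t* mats, int mat_count)
{
    mmdeploy_rotated_detection_t* dets{};
    int* det_count{};
    if (mmdeploy_rotated_detector_apply(detector, mats, mat_count, &dets, &det_count)) return -1;

    const mmdeploy_rotated_detection_t* p = dets;  // walk the linear buffer image by image
    for (int i = 0; i < mat_count; ++i)
    {
        for (int j = 0; j < det_count[i]; ++j, ++p)
        {
            std::printf("img %d: label=%d score=%.2f cx=%.1f cy=%.1f w=%.1f h=%.1f angle=%.3f\n",
                        i, p->label_id, p->score, p->rbbox[0], p->rbbox[1], p->rbbox[2], p->rbbox[3], p->rbbox[4]);
        }
    }

    mmdeploy_rotated_detector_release_result(dets, det_count);
    return 0;
}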
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h index 35125a74ff..1d745debae 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/rotated_detector.h @@ -13,125 +13,126 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_rotated_detection_t { - int label_id; - float score; - float rbbox[5]; // cx, cy, w, h, angle -} mmdeploy_rotated_detection_t; - -typedef struct mmdeploy_rotated_detector* mmdeploy_rotated_detector_t; - -/** - * @brief Create rotated detector's handle - * @param[in] model an instance of mmrotate sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a rotated detector - * @return status of creating rotated detector's handle - */ -MMDEPLOY_API int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, - int device_id, - mmdeploy_rotated_detector_t* detector); - -/** - * @brief Create rotated detector's handle - * @param[in] model_path path of mmrotate sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] detector instance of a rotated detector - * @return status of creating rotated detector's handle - */ -MMDEPLOY_API int mmdeploy_rotated_detector_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_rotated_detector_t* detector); - -/** - * @brief Apply rotated detector to batch images and get their inference results - * @param[in] detector rotated detector's handle created by \ref - * mmdeploy_rotated_detector_create_by_path - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_rotated_detector_release_result - * @param[out] result_count a linear buffer with length being \p mat_count to save the number of - * detection results of each image. And it must be released by \ref - * mmdeploy_rotated_detector_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_rotated_detection_t** results, - int** result_count); - -/** @brief Release the inference result buffer created by \ref mmdeploy_rotated_detector_apply - * @param[in] results rotated detection results buffer - * @param[in] result_count \p results size buffer - */ -MMDEPLOY_API void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results, - const int* result_count); - -/** - * @brief Destroy rotated detector's handle - * @param[in] detector rotated detector's handle created by \ref - * mmdeploy_rotated_detector_create_by_path or by \ref mmdeploy_rotated_detector_create - */ -MMDEPLOY_API void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks - * via context - */ -MMDEPLOY_API int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_rotated_detector_t* detector); - -/** - * @brief Pack rotated detector inputs into mmdeploy_value_t - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @return the created value - */ -MMDEPLOY_API int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input); - -/** - * @brief Same as \ref mmdeploy_rotated_detector_apply, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, - mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Apply rotated detector asynchronously - * @param[in] detector handle to the detector - * @param[in] input input sender - * @return output sender - */ -MMDEPLOY_API int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -/** - * @brief Unpack rotated detector output from a mmdeploy_value_t - * @param[in] output output obtained by applying a detector - * @param[out] results a linear buffer to save detection results of each image. It must be released - * by \ref mmdeploy_detector_release_result - * @param[out] result_count a linear buffer with length number of input images to save the number of - * detection results of each image. Must be released by \ref - * mmdeploy_detector_release_result - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output, - mmdeploy_rotated_detection_t** results, - int** result_count); + typedef struct mmdeploy_rotated_detection_t + { + int label_id; + float score; + float rbbox[5]; // cx, cy, w, h, angle + } mmdeploy_rotated_detection_t; + + typedef struct mmdeploy_rotated_detector* mmdeploy_rotated_detector_t; + + /** + * @brief Create rotated detector's handle + * @param[in] model an instance of mmrotate sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device.
+ * @param[out] detector instance of a rotated detector + * @return status of creating rotated detector's handle + */ + MMDEPLOY_API int mmdeploy_rotated_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_rotated_detector_t* detector); + + /** + * @brief Create rotated detector's handle + * @param[in] model_path path of mmrotate sdk model exported by mmdeploy model converter + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] detector instance of a rotated detector + * @return status of creating rotated detector's handle + */ + MMDEPLOY_API int mmdeploy_rotated_detector_create_by_path(const char* model_path, + const char* device_name, + int device_id, + mmdeploy_rotated_detector_t* detector); + + /** + * @brief Apply rotated detector to batch images and get their inference results + * @param[in] detector rotated detector's handle created by \ref + * mmdeploy_rotated_detector_create_by_path + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] results a linear buffer to save detection results of each image. It must be released + * by \ref mmdeploy_rotated_detector_release_result + * @param[out] result_count a linear buffer with length being \p mat_count to save the number of + * detection results of each image. And it must be released by \ref + * mmdeploy_rotated_detector_release_result + * @return status of inference + */ + MMDEPLOY_API int mmdeploy_rotated_detector_apply(mmdeploy_rotated_detector_t detector, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_rotated_detection_t** results, + int** result_count); + + /** @brief Release the inference result buffer created by \ref mmdeploy_rotated_detector_apply + * @param[in] results rotated detection results buffer + * @param[in] result_count \p results size buffer + */ + MMDEPLOY_API void mmdeploy_rotated_detector_release_result(mmdeploy_rotated_detection_t* results, + const int* result_count); + + /** + * @brief Destroy rotated detector's handle + * @param[in] detector rotated detector's handle created by \ref + * mmdeploy_rotated_detector_create_by_path or by \ref mmdeploy_rotated_detector_create + */ + MMDEPLOY_API void mmdeploy_rotated_detector_destroy(mmdeploy_rotated_detector_t detector); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + /** + * @brief Same as \ref mmdeploy_detector_create, but allows to control execution context of tasks + * via context + */ + MMDEPLOY_API int mmdeploy_rotated_detector_create_v2(mmdeploy_model_t model, + mmdeploy_context_t context, + mmdeploy_rotated_detector_t* detector); + + /** + * @brief Pack rotated detector inputs into mmdeploy_value_t + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @return the created value + */ + MMDEPLOY_API int mmdeploy_rotated_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input); + + /** + * @brief Same as \ref mmdeploy_rotated_detector_apply, but input and output are packed in \ref + * mmdeploy_value_t. 
+ */ + MMDEPLOY_API int mmdeploy_rotated_detector_apply_v2(mmdeploy_rotated_detector_t detector, + mmdeploy_value_t input, + mmdeploy_value_t* output); + + /** + * @brief Apply rotated detector asynchronously + * @param[in] detector handle to the detector + * @param[in] input input sender + * @return output sender + */ + MMDEPLOY_API int mmdeploy_rotated_detector_apply_async(mmdeploy_rotated_detector_t detector, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + /** + * @brief Unpack rotated detector output from a mmdeploy_value_t + * @param[in] output output obtained by applying a detector + * @param[out] results a linear buffer to save detection results of each image. It must be released + * by \ref mmdeploy_detector_release_result + * @param[out] result_count a linear buffer with length number of input images to save the number of + * detection results of each image. Must be released by \ref + * mmdeploy_detector_release_result + * @return status of the operation + */ + MMDEPLOY_API int mmdeploy_rotated_detector_get_result(mmdeploy_value_t output, + mmdeploy_rotated_detection_t** results, + int** result_count); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp index c982df39e5..9ec8ae366c 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp @@ -18,111 +18,128 @@ using namespace mmdeploy; using ResultType = mmdeploy::Structure; -int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_segmentor_t* segmentor) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_segmentor_create_v2(model, context, segmentor); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_segmentor_create_v2(model, context, segmentor); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_segmentor_t* segmentor) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor) +{ + mmdeploy_model_t model{}; + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_segmentation_t** results) { - wrapped input; - if (auto ec = mmdeploy_segmentor_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_segmentor_apply_v2(segmentor, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_segmentor_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int 
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
index c982df39e5..9ec8ae366c 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.cpp
@@ -18,111 +18,128 @@
 using namespace mmdeploy;

 using ResultType = mmdeploy::Structure;

-int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                              mmdeploy_segmentor_t* segmentor) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_segmentor_create_v2(model, context, segmentor);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_segmentor_create_v2(model, context, segmentor);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name,
-                                      int device_id, mmdeploy_segmentor_t* segmentor) {
-  mmdeploy_model_t model{};
-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor)
+{
+    mmdeploy_model_t model{};
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor);
+    mmdeploy_model_destroy(model);
     return ec;
-  }
-  auto ec = mmdeploy_segmentor_create(model, device_name, device_id, segmentor);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, const mmdeploy_mat_t* mats,
-                             int mat_count, mmdeploy_segmentation_t** results) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec = mmdeploy_segmentor_create_input(mats, mat_count, input.ptr())) {
-    return ec;
-  }
-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_segmentor_apply_v2(segmentor, input, output.ptr())) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_segmentor_get_result(output, results)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_segmentation_t** results)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec = mmdeploy_segmentor_create_input(mats, mat_count, input.ptr()))
+    {
+        return ec;
+    }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_segmentor_apply_v2(segmentor, input, output.ptr()))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_segmentor_get_result(output, results))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count) {
-  ResultType deleter(static_cast<size_t>(count), results);
+void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count)
+{
+    ResultType deleter(static_cast<size_t>(count), results);
 }

-void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)segmentor);
+void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)segmentor);
 }

-int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                 mmdeploy_segmentor_t* segmentor) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)segmentor);
+int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_segmentor_t* segmentor)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)segmentor);
 }

-int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                    mmdeploy_value_t* value) {
-  return mmdeploy_common_create_input(mats, mat_count, value);
+int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value)
+{
+    return mmdeploy_common_create_input(mats, mat_count, value);
 }

-int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input,
-                                mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)segmentor, input, output);
+int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input, mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)segmentor, input, output);
 }

-int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, mmdeploy_sender_t input,
-                                   mmdeploy_sender_t* output) {
-  return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)segmentor, input, output);
+int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, mmdeploy_sender_t input, mmdeploy_sender_t* output)
+{
+    return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)segmentor, input, output);
 }

-int mmdeploy_segmentor_get_result(mmdeploy_value_t output, mmdeploy_segmentation_t** results) {
-  try {
-    const auto& value = Cast(output)->front();
-    size_t image_count = value.size();
-
-    ResultType r(image_count);
-    auto [results_data, buffers] = r.pointers();
-
-    auto results_ptr = results_data;
-
-    for (auto i = 0; i < image_count; ++i, ++results_ptr) {
-      auto& output_item = value[i];
-      MMDEPLOY_DEBUG("the {}-th item in output: {}", i, output_item);
-      auto segmentor_output = from_value(output_item);
-      results_ptr->height = segmentor_output.height;
-      results_ptr->width = segmentor_output.width;
-      results_ptr->classes = segmentor_output.classes;
-      auto& mask = segmentor_output.mask;
-      auto& score = segmentor_output.score;
-      results_ptr->mask = nullptr;
-      results_ptr->score = nullptr;
-      if (mask.shape().size()) {
-        results_ptr->mask = mask.data();
-        buffers[i] = mask.buffer();
-      } else {
-        results_ptr->score 
= score.data(); - buffers[i] = score.buffer(); - } +int mmdeploy_segmentor_get_result(mmdeploy_value_t output, mmdeploy_segmentation_t** results) +{ + try + { + const auto& value = Cast(output)->front(); + size_t image_count = value.size(); + + ResultType r(image_count); + auto [results_data, buffers] = r.pointers(); + + auto results_ptr = results_data; + + for (auto i = 0; i < image_count; ++i, ++results_ptr) + { + auto& output_item = value[i]; + MMDEPLOY_DEBUG("the {}-th item in output: {}", i, output_item); + auto segmentor_output = from_value(output_item); + results_ptr->height = segmentor_output.height; + results_ptr->width = segmentor_output.width; + results_ptr->classes = segmentor_output.classes; + auto& mask = segmentor_output.mask; + auto& score = segmentor_output.score; + results_ptr->mask = nullptr; + results_ptr->score = nullptr; + if (mask.shape().size()) + { + results_ptr->mask = mask.data(); + buffers[i] = mask.buffer(); + } + else + { + results_ptr->score = score.data(); + buffers[i] = score.buffer(); + } + } + + *results = results_data; + r.release(); + + return MMDEPLOY_SUCCESS; } - - *results = results_data; - r.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h index 65bcfd03f3..8d885a275b 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/segmentor.h @@ -13,91 +13,90 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_segmentation_t { - int height; ///< height of \p mask that equals to the input image's height - int width; ///< width of \p mask that equals to the input image's width - int classes; ///< the number of labels in \p mask - int* mask; ///< segmentation mask of the input image, in which mask[i * width + j] indicates - ///< the label id of pixel at (i, j), this field might be null - float* score; ///< segmentation score map of the input image in CHW format, in which - ///< score[height * width * k + i * width + j] indicates the score - ///< of class k at pixel (i, j), this field might be null -} mmdeploy_segmentation_t; - -typedef struct mmdeploy_segmentor* mmdeploy_segmentor_t; - -/** - * @brief Create segmentor's handle - * @param[in] model an instance of mmsegmentation sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] segmentor instance of a segmentor, which must be destroyed - * by \ref mmdeploy_segmentor_destroy - * @return status of creating segmentor's handle - */ -MMDEPLOY_API int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, - int device_id, mmdeploy_segmentor_t* segmentor); - -/** - * @brief Create segmentor's handle - * @param[in] model_path path of mmsegmentation sdk model exported by mmdeploy model converter - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. 
- * @param[out] segmentor instance of a segmentor, which must be destroyed - * by \ref mmdeploy_segmentor_destroy - * @return status of creating segmentor's handle - */ -MMDEPLOY_API int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_segmentor_t* segmentor); - -/** - * @brief Apply segmentor to batch images and get their inference results - * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path or \ref - * mmdeploy_segmentor_create - * @param[in] mats a batch of images - * @param[in] mat_count number of images in the batch - * @param[out] results a linear buffer of length \p mat_count to save segmentation result of each - * image. It must be released by \ref mmdeploy_segmentor_release_result - * @return status of inference - */ -MMDEPLOY_API int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, - const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_segmentation_t** results); - -/** - * @brief Release result buffer returned by \ref mmdeploy_segmentor_apply - * @param[in] results result buffer - * @param[in] count length of \p results - */ -MMDEPLOY_API void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count); - -/** - * @brief Destroy segmentor's handle - * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path - */ -MMDEPLOY_API void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -MMDEPLOY_API int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_segmentor_t* segmentor); - -MMDEPLOY_API int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* value); - -MMDEPLOY_API int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input, - mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, - mmdeploy_sender_t input, mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_segmentor_get_result(mmdeploy_value_t output, - mmdeploy_segmentation_t** results); + typedef struct mmdeploy_segmentation_t + { + int height; ///< height of \p mask that equals to the input image's height + int width; ///< width of \p mask that equals to the input image's width + int classes; ///< the number of labels in \p mask + int* mask; ///< segmentation mask of the input image, in which mask[i * width + j] indicates + ///< the label id of pixel at (i, j), this field might be null + float* score; ///< segmentation score map of the input image in CHW format, in which + ///< score[height * width * k + i * width + j] indicates the score + ///< of class k at pixel (i, j), this field might be null + } mmdeploy_segmentation_t; + + typedef struct mmdeploy_segmentor* mmdeploy_segmentor_t; + + /** + * @brief Create segmentor's handle + * @param[in] model an instance of mmsegmentation sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+ * @param[out] segmentor instance of a segmentor, which must be destroyed + * by \ref mmdeploy_segmentor_destroy + * @return status of creating segmentor's handle + */ + MMDEPLOY_API int mmdeploy_segmentor_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor); + + /** + * @brief Create segmentor's handle + * @param[in] model_path path of mmsegmentation sdk model exported by mmdeploy model converter + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. + * @param[out] segmentor instance of a segmentor, which must be destroyed + * by \ref mmdeploy_segmentor_destroy + * @return status of creating segmentor's handle + */ + MMDEPLOY_API int mmdeploy_segmentor_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_segmentor_t* segmentor); + + /** + * @brief Apply segmentor to batch images and get their inference results + * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path or \ref + * mmdeploy_segmentor_create + * @param[in] mats a batch of images + * @param[in] mat_count number of images in the batch + * @param[out] results a linear buffer of length \p mat_count to save segmentation result of each + * image. It must be released by \ref mmdeploy_segmentor_release_result + * @return status of inference + */ + MMDEPLOY_API int mmdeploy_segmentor_apply(mmdeploy_segmentor_t segmentor, + const mmdeploy_mat_t* mats, + int mat_count, + mmdeploy_segmentation_t** results); + + /** + * @brief Release result buffer returned by \ref mmdeploy_segmentor_apply + * @param[in] results result buffer + * @param[in] count length of \p results + */ + MMDEPLOY_API void mmdeploy_segmentor_release_result(mmdeploy_segmentation_t* results, int count); + + /** + * @brief Destroy segmentor's handle + * @param[in] segmentor segmentor's handle created by \ref mmdeploy_segmentor_create_by_path + */ + MMDEPLOY_API void mmdeploy_segmentor_destroy(mmdeploy_segmentor_t segmentor); + + /****************************************************************************** + * Experimental asynchronous APIs */ + + MMDEPLOY_API int mmdeploy_segmentor_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_segmentor_t* segmentor); + + MMDEPLOY_API int mmdeploy_segmentor_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* value); + + MMDEPLOY_API int mmdeploy_segmentor_apply_v2(mmdeploy_segmentor_t segmentor, mmdeploy_value_t input, mmdeploy_value_t* output); + + MMDEPLOY_API int mmdeploy_segmentor_apply_async(mmdeploy_segmentor_t segmentor, + mmdeploy_sender_t input, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_segmentor_get_result(mmdeploy_value_t output, + mmdeploy_segmentation_t** results); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp index 576af07762..44b124187f 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.cpp @@ -16,158 +16,186 @@ using namespace std; using namespace mmdeploy; -int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_text_detector_t* detector) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, 
mmdeploy_text_detector_t* detector) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_text_detector_create_v2(model, context, detector); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_text_detector_create_v2(model, context, detector); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_text_detector_t* detector) { - return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); +int mmdeploy_text_detector_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_text_detector_t* detector) +{ + return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)detector); } -int mmdeploy_text_detector_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_text_detector_t* detector) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_text_detector_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_text_detector_t* detector) +{ + mmdeploy_model_t model{}; + if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_text_detector_create(model, device_name, device_id, detector); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_text_detector_create(model, device_name, device_id, detector); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, - mmdeploy_value_t* input) { - return mmdeploy_common_create_input(mats, mat_count, input); +int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input) +{ + return mmdeploy_common_create_input(mats, mat_count, input); } -int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, const mmdeploy_mat_t* mats, - int mat_count, mmdeploy_text_detection_t** results, - int** result_count) { - wrapped input; - if (auto ec = mmdeploy_text_detector_create_input(mats, mat_count, input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_text_detector_apply_v2(detector, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_text_detector_get_result(output, results, result_count)) { - return ec; - } - return MMDEPLOY_SUCCESS; +int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector, const mmdeploy_mat_t* mats, int mat_count, mmdeploy_text_detection_t** results, int** result_count) +{ + wrapped input; + if (auto ec = mmdeploy_text_detector_create_input(mats, mat_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_text_detector_apply_v2(detector, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_text_detector_get_result(output, results, result_count)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)detector, input, output); } -int 
mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, mmdeploy_sender_t input, - mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); +int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector, mmdeploy_sender_t input, mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)detector, input, output); } -int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, - int** result_count) { - if (!output || !results || !result_count) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value& value = reinterpret_cast(output)->front(); - auto detector_outputs = from_value>(value); - - vector _result_count; - _result_count.reserve(detector_outputs.size()); - for (const auto& det_output : detector_outputs) { - _result_count.push_back((int)det_output.size()); +int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, int** result_count) +{ + if (!output || !results || !result_count) + { + return MMDEPLOY_E_INVALID_ARG; } - - auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0); - - std::unique_ptr result_count_data(new int[_result_count.size()]{}); - std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); - - std::unique_ptr result_data( - new mmdeploy_text_detection_t[total]{}); - auto result_ptr = result_data.get(); - - for (const auto& det_output : detector_outputs) { - for (auto i = 0; i < det_output.size(); ++i, ++result_ptr) { - result_ptr->score = det_output[i].score; - auto& bbox = det_output[i].bbox; - for (auto j = 0; j < bbox.size(); j += 2) { - result_ptr->bbox[j / 2].x = bbox[j]; - result_ptr->bbox[j / 2].y = bbox[j + 1]; + try + { + Value& value = reinterpret_cast(output)->front(); + auto detector_outputs = from_value>(value); + + vector _result_count; + _result_count.reserve(detector_outputs.size()); + for (const auto& det_output : detector_outputs) + { + _result_count.push_back((int)det_output.size()); } - } - } - *result_count = result_count_data.release(); - *results = result_data.release(); + auto total = std::accumulate(_result_count.begin(), _result_count.end(), 0); + + std::unique_ptr result_count_data(new int[_result_count.size()]{}); + std::copy(_result_count.begin(), _result_count.end(), result_count_data.get()); + + std::unique_ptr result_data( + new mmdeploy_text_detection_t[total]{}); + auto result_ptr = result_data.get(); + + for (const auto& det_output : detector_outputs) + { + for (auto i = 0; i < det_output.size(); ++i, ++result_ptr) + { + result_ptr->score = det_output[i].score; + auto& bbox = det_output[i].bbox; + for (auto j = 0; j < bbox.size(); j += 2) + { + result_ptr->bbox[j / 2].x = bbox[j]; + result_ptr->bbox[j / 2].y = bbox[j + 1]; + } + } + } - return MMDEPLOY_SUCCESS; + *result_count = result_count_data.release(); + *results = result_data.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return 0; + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) 
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
+    }
+    return MMDEPLOY_E_FAIL;
 }

 void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results,
-                                           const int* result_count, int count) {
-  delete[] results;
-  delete[] result_count;
+                                           const int* result_count,
+                                           int count)
+{
+    delete[] results;
+    delete[] result_count;
 }

-void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
+void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)detector);
 }

-int mmdeploy_text_detector_apply_async_v2(mmdeploy_text_detector_t detector,
-                                          const mmdeploy_mat_t* imgs, int img_count,
-                                          mmdeploy_text_detector_continue_t cont, void* context,
-                                          mmdeploy_sender_t* output) {
-  mmdeploy_sender_t result_sender{};
-  if (auto ec = mmdeploy_text_detector_apply_async_v3(detector, imgs, img_count, &result_sender)) {
-    return ec;
-  }
-  if (auto ec = mmdeploy_text_detector_continue_async(result_sender, cont, context, output)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+int mmdeploy_text_detector_apply_async_v2(mmdeploy_text_detector_t detector,
+                                          const mmdeploy_mat_t* imgs,
+                                          int img_count,
+                                          mmdeploy_text_detector_continue_t cont,
+                                          void* context,
+                                          mmdeploy_sender_t* output)
+{
+    mmdeploy_sender_t result_sender{};
+    if (auto ec = mmdeploy_text_detector_apply_async_v3(detector, imgs, img_count, &result_sender))
+    {
+        return ec;
+    }
+    if (auto ec = mmdeploy_text_detector_continue_async(result_sender, cont, context, output))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

 int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector,
-                                          const mmdeploy_mat_t* imgs, int img_count,
-                                          mmdeploy_sender_t* output) {
-  wrapped<mmdeploy_value_t> input_val;
-  if (auto ec = mmdeploy_text_detector_create_input(imgs, img_count, input_val.ptr())) {
-    return ec;
-  }
-  mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val);
-  if (auto ec = mmdeploy_text_detector_apply_async(detector, input_sndr, output)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+                                          const mmdeploy_mat_t* imgs,
+                                          int img_count,
+                                          mmdeploy_sender_t* output)
+{
+    wrapped<mmdeploy_value_t> input_val;
+    if (auto ec = mmdeploy_text_detector_create_input(imgs, img_count, input_val.ptr()))
+    {
+        return ec;
+    }
+    mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val);
+    if (auto ec = mmdeploy_text_detector_apply_async(detector, input_sndr, output))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

-int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
-                                          mmdeploy_text_detector_continue_t cont, void* context,
-                                          mmdeploy_sender_t* output) {
-  auto sender = Guard([&] {
-    return Take(
-        LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender {
+int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
+                                          mmdeploy_text_detector_continue_t cont,
+                                          void* context,
+                                          mmdeploy_sender_t* output)
+{
+    auto sender = Guard([&]
+                        { return Take(
+                              LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender
+                                       {
         mmdeploy_text_detection_t* results{};
         int* result_count{};
         if (auto ec = mmdeploy_text_detector_get_result(Cast(&value), &results, &result_count)) {
@@ -178,12 +206,11 @@ int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
       if (auto ec = fn(results, result_count, context, &output); ec || !output) {
         return Just(Value());
       }
-      return Take(output);
-    }));
-  });
-  if (sender) {
-    *output = sender;
-    return MMDEPLOY_SUCCESS;
-  }
-  return MMDEPLOY_E_FAIL;
+            return Take(output); 
})); });
+    if (sender)
+    {
+        *output = sender;
+        return MMDEPLOY_SUCCESS;
+    }
+    return MMDEPLOY_E_FAIL;
 }
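Before the header that declares this API is reformatted below, a short usage sketch may help; it shows the ownership contract that the doc comments describe (detections of all images packed in one linear buffer, released through the dedicated release function). This is illustrative only and not part of the diff.

#include <stdio.h>

#include "mmdeploy/text_detector.h"

/* Print every detected box for a batch of `n` preloaded images. */
int print_text_boxes(mmdeploy_text_detector_t detector, const mmdeploy_mat_t* mats, int n)
{
    mmdeploy_text_detection_t* dets = NULL;
    int* det_count = NULL; /* det_count[i] = number of boxes in image i */
    int ec = mmdeploy_text_detector_apply(detector, mats, n, &dets, &det_count);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }
    const mmdeploy_text_detection_t* p = dets; /* boxes of all images are packed linearly */
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < det_count[i]; ++j, ++p)
        {
            printf("img %d box %d score %.3f first vertex (%.1f, %.1f)\n",
                   i, j, p->score, p->bbox[0].x, p->bbox[0].y);
        }
    }
    mmdeploy_text_detector_release_result(dets, det_count, n);
    return MMDEPLOY_SUCCESS;
}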
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.h b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.h
index a3c38dc6f6..da363940d7 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/text_detector.h
+++ b/csrc/mmdeploy/apis/c/mmdeploy/text_detector.h
@@ -13,141 +13,147 @@
 #include "mmdeploy/model.h"

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

-typedef struct mmdeploy_text_detection_t {
-  mmdeploy_point_t bbox[4];  ///< a text bounding box of which the vertex are in clock-wise
-  float score;
-} mmdeploy_text_detection_t;
-
-typedef struct mmdeploy_text_detector* mmdeploy_text_detector_t;
-
-/**
- * @brief Create text-detector's handle
- * @param[in] model an instance of mmocr text detection model created by
- * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device.
- * @param[out] detector instance of a text-detector, which must be destroyed
- * by \ref mmdeploy_text_detector_destroy
- * @return status of creating text-detector's handle
- */
-MMDEPLOY_API int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name,
-                                               int device_id, mmdeploy_text_detector_t* detector);
-
-/**
- * @brief Create text-detector's handle
- * @param[in] model_path path to text detection model
- * @param[in] device_name name of device, such as "cpu", "cuda", etc.
- * @param[in] device_id id of device
- * @param[out] detector instance of a text-detector, which must be destroyed
- * by \ref mmdeploy_text_detector_destroy
- * @return status of creating text-detector's handle
- */
-MMDEPLOY_API int mmdeploy_text_detector_create_by_path(const char* model_path,
-                                                       const char* device_name, int device_id,
-                                                       mmdeploy_text_detector_t* detector);
-
-/**
- * @brief Apply text-detector to batch images and get their inference results
- * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path
- * @param[in] mats a batch of images
- * @param[in] mat_count number of images in the batch
- * @param[out] results a linear buffer to save text detection results of each
- * image. It must be released by calling \ref mmdeploy_text_detector_release_result
- * @param[out] result_count a linear buffer of length \p mat_count to save the number of detection
- * results of each image. It must be released by \ref mmdeploy_detector_release_result
- * @return status of inference
- */
-MMDEPLOY_API int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector,
-                                              const mmdeploy_mat_t* mats, int mat_count,
-                                              mmdeploy_text_detection_t** results,
-                                              int** result_count);
-
-/** @brief Release the inference result buffer returned by \ref mmdeploy_text_detector_apply
- * @param[in] results text detection result buffer
- * @param[in] result_count \p results size buffer
- * @param[in] count the length of buffer \p result_count
- */
-MMDEPLOY_API void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results,
-                                                        const int* result_count, int count);
-
-/**
- * @brief Destroy text-detector's handle
- * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path
- * or \ref mmdeploy_text_detector_create
- */
-MMDEPLOY_API void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector);
-
-/******************************************************************************
- * Experimental asynchronous APIs */
-
-/**
- * @brief Same as \ref mmdeploy_text_detector_create, but allows to control execution context of
- * tasks via context
- */
-MMDEPLOY_API int mmdeploy_text_detector_create_v2(mmdeploy_model_t model,
-                                                  mmdeploy_context_t context,
-                                                  mmdeploy_text_detector_t* detector);
-
-/**
- * @brief Pack text-detector inputs into mmdeploy_value_t
- * @param[in] mats a batch of images
- * @param[in] mat_count number of images in the batch
- * @return the created value
- */
-MMDEPLOY_API int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count,
-                                                     mmdeploy_value_t* input);
-
-/**
- * @brief Same as \ref mmdeploy_text_detector_apply, but input and output are packed in \ref
- * mmdeploy_value_t.
- */
-MMDEPLOY_API int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector,
-                                                 mmdeploy_value_t input, mmdeploy_value_t* output);
-
-/**
- * @brief Apply text-detector asynchronously
- * @param[in] detector handle to the detector
- * @param[in] input input sender that will be consumed by the operation
- * @return output sender
- */
-MMDEPLOY_API int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector,
-                                                    mmdeploy_sender_t input,
-                                                    mmdeploy_sender_t* output);
-
-/**
- * @brief Unpack detector output from a mmdeploy_value_t
- * @param[in] output output sender returned by applying a detector
- * @param[out] results a linear buffer to save detection results of each image. It must be
- * released by \ref mmdeploy_text_detector_release_result
- * @param[out] result_count a linear buffer with length number of input images to save the
- * number of detection results of each image. 
Must be released by \ref
- * mmdeploy_text_detector_release_result
- * @return status of the operation
- */
-MMDEPLOY_API
-int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results,
-                                      int** result_count);
-
-typedef int (*mmdeploy_text_detector_continue_t)(mmdeploy_text_detection_t* results,
-                                                 int* result_count, void* context,
-                                                 mmdeploy_sender_t* output);
-
-// MMDEPLOY_API int mmdeploy_text_detector_apply_async_v2(mm_handle_t handle, const mm_mat_t* imgs,
-//                                                        int img_count,
-//                                                        mmdeploy_text_detector_continuation_t
-//                                                        cont, void* context, mmdeploy_sender_t*
-//                                                        output);
-
-MMDEPLOY_API int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector,
-                                                       const mmdeploy_mat_t* imgs, int img_count,
-                                                       mmdeploy_sender_t* output);
-
-MMDEPLOY_API int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input,
-                                                       mmdeploy_text_detector_continue_t cont,
-                                                       void* context, mmdeploy_sender_t* output);
+    typedef struct mmdeploy_text_detection_t
+    {
+        mmdeploy_point_t bbox[4];  ///< a text bounding box whose vertices are in clockwise order
+        float score;
+    } mmdeploy_text_detection_t;
+
+    typedef struct mmdeploy_text_detector* mmdeploy_text_detector_t;
+
+    /**
+     * @brief Create text-detector's handle
+     * @param[in] model an instance of mmocr text detection model created by
+     * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] detector instance of a text-detector, which must be destroyed
+     * by \ref mmdeploy_text_detector_destroy
+     * @return status of creating text-detector's handle
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_detector_t* detector);
+
+    /**
+     * @brief Create text-detector's handle
+     * @param[in] model_path path to text detection model
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device
+     * @param[out] detector instance of a text-detector, which must be destroyed
+     * by \ref mmdeploy_text_detector_destroy
+     * @return status of creating text-detector's handle
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_create_by_path(const char* model_path,
+                                                           const char* device_name,
+                                                           int device_id,
+                                                           mmdeploy_text_detector_t* detector);
+
+    /**
+     * @brief Apply text-detector to batch images and get their inference results
+     * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path
+     * @param[in] mats a batch of images
+     * @param[in] mat_count number of images in the batch
+     * @param[out] results a linear buffer to save text detection results of each
+     * image. It must be released by calling \ref mmdeploy_text_detector_release_result
+     * @param[out] result_count a linear buffer of length \p mat_count to save the number of detection
+     * results of each image. 
It must be released by \ref mmdeploy_text_detector_release_result
+     * @return status of inference
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_apply(mmdeploy_text_detector_t detector,
+                                                  const mmdeploy_mat_t* mats,
+                                                  int mat_count,
+                                                  mmdeploy_text_detection_t** results,
+                                                  int** result_count);
+
+    /** @brief Release the inference result buffer returned by \ref mmdeploy_text_detector_apply
+     * @param[in] results text detection result buffer
+     * @param[in] result_count \p results size buffer
+     * @param[in] count the length of buffer \p result_count
+     */
+    MMDEPLOY_API void mmdeploy_text_detector_release_result(mmdeploy_text_detection_t* results,
+                                                            const int* result_count,
+                                                            int count);
+
+    /**
+     * @brief Destroy text-detector's handle
+     * @param[in] detector text-detector's handle created by \ref mmdeploy_text_detector_create_by_path
+     * or \ref mmdeploy_text_detector_create
+     */
+    MMDEPLOY_API void mmdeploy_text_detector_destroy(mmdeploy_text_detector_t detector);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    /**
+     * @brief Same as \ref mmdeploy_text_detector_create, but allows controlling the execution
+     * context of tasks via \p context
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_create_v2(mmdeploy_model_t model,
+                                                      mmdeploy_context_t context,
+                                                      mmdeploy_text_detector_t* detector);
+
+    /**
+     * @brief Pack text-detector inputs into mmdeploy_value_t
+     * @param[in] mats a batch of images
+     * @param[in] mat_count number of images in the batch
+     * @param[out] input the packed value
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_create_input(const mmdeploy_mat_t* mats, int mat_count, mmdeploy_value_t* input);
+
+    /**
+     * @brief Same as \ref mmdeploy_text_detector_apply, but input and output are packed in \ref
+     * mmdeploy_value_t.
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_apply_v2(mmdeploy_text_detector_t detector,
+                                                     mmdeploy_value_t input,
+                                                     mmdeploy_value_t* output);
+
+    /**
+     * @brief Apply text-detector asynchronously
+     * @param[in] detector handle to the detector
+     * @param[in] input input sender that will be consumed by the operation
+     * @param[out] output output sender
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_detector_apply_async(mmdeploy_text_detector_t detector,
+                                                        mmdeploy_sender_t input,
+                                                        mmdeploy_sender_t* output);
+
+    /**
+     * @brief Unpack detector output from a mmdeploy_value_t
+     * @param[in] output output sender returned by applying a detector
+     * @param[out] results a linear buffer to save detection results of each image. It must be
+     * released by \ref mmdeploy_text_detector_release_result
+     * @param[out] result_count a linear buffer with length equal to the number of input images
+     * to save the number of detection results of each image. 
Must be released by \ref + * mmdeploy_text_detector_release_result + * @return status of the operation + */ + MMDEPLOY_API + int mmdeploy_text_detector_get_result(mmdeploy_value_t output, mmdeploy_text_detection_t** results, int** result_count); + + typedef int (*mmdeploy_text_detector_continue_t)(mmdeploy_text_detection_t* results, + int* result_count, + void* context, + mmdeploy_sender_t* output); + + // MMDEPLOY_API int mmdeploy_text_detector_apply_async_v2(mm_handle_t handle, const mm_mat_t* imgs, + // int img_count, + // mmdeploy_text_detector_continuation_t + // cont, void* context, mmdeploy_sender_t* + // output); + + MMDEPLOY_API int mmdeploy_text_detector_apply_async_v3(mmdeploy_text_detector_t detector, + const mmdeploy_mat_t* imgs, + int img_count, + mmdeploy_sender_t* output); + + MMDEPLOY_API int mmdeploy_text_detector_continue_async(mmdeploy_sender_t input, + mmdeploy_text_detector_continue_t cont, + void* context, + mmdeploy_sender_t* output); #ifdef __cplusplus } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp index 3c8cfbb5c6..4c94666add 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp +++ b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.cpp @@ -19,10 +19,12 @@ using namespace mmdeploy; -namespace { +namespace +{ -Value config_template(const Model& model) { - // clang-format off + Value config_template(const Model& model) + { + // clang-format off return { {"type", "Pipeline"}, {"input", {"imgs", "bboxes"}}, @@ -44,194 +46,238 @@ Value config_template(const Model& model) { }, {"output", "texts"}, }; - // clang-format on -} + // clang-format on + } } // namespace -int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, - mmdeploy_text_recognizer_t* recognizer) { - mmdeploy_context_t context{}; - auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); - if (ec != MMDEPLOY_SUCCESS) { +int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer) +{ + mmdeploy_context_t context{}; + auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context); + if (ec != MMDEPLOY_SUCCESS) + { + return ec; + } + ec = mmdeploy_text_recognizer_create_v2(model, context, recognizer); + mmdeploy_context_destroy(context); return ec; - } - ec = mmdeploy_text_recognizer_create_v2(model, context, recognizer); - mmdeploy_context_destroy(context); - return ec; } -int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, - mmdeploy_text_recognizer_t* recognizer) { - auto config = config_template(*Cast(model)); - return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)recognizer); +int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_text_recognizer_t* recognizer) +{ + auto config = config_template(*Cast(model)); + return mmdeploy_pipeline_create_v3(Cast(&config), context, (mmdeploy_pipeline_t*)recognizer); } -int mmdeploy_text_recognizer_create_by_path(const char* model_path, const char* device_name, - int device_id, mmdeploy_text_recognizer_t* recognizer) { - mmdeploy_model_t model{}; - if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) { +int mmdeploy_text_recognizer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer) +{ + mmdeploy_model_t model{}; + if (auto ec = 
mmdeploy_model_create_by_path(model_path, &model)) + { + return ec; + } + auto ec = mmdeploy_text_recognizer_create(model, device_name, device_id, recognizer); + mmdeploy_model_destroy(model); return ec; - } - auto ec = mmdeploy_text_recognizer_create(model, device_name, device_id, recognizer); - mmdeploy_model_destroy(model); - return ec; } -int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int count, - mmdeploy_text_recognition_t** results) { - return mmdeploy_text_recognizer_apply_bbox(recognizer, images, count, nullptr, nullptr, results); +int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int count, + mmdeploy_text_recognition_t** results) +{ + return mmdeploy_text_recognizer_apply_bbox(recognizer, images, count, nullptr, nullptr, results); } -int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, int image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, mmdeploy_value_t* output) { - if (image_count && images == nullptr) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - Value::Array input_images; - Value::Array input_bboxes; - - auto add_bbox = [&](Mat img, const mmdeploy_text_detection_t* det) { - if (det) { - const auto& b = det->bbox; - Value::Array bbox{b[0].x, b[0].y, b[1].x, b[1].y, b[2].x, b[2].y, b[3].x, b[3].y}; - input_bboxes.push_back({{"bbox", std::move(bbox)}}); - } else { - input_bboxes.push_back(nullptr); - } - input_images.push_back({{"ori_img", img}}); - }; - - for (int i = 0; i < image_count; ++i) { - auto _mat = Cast(images[i]); - if (bboxes && bbox_count) { - for (int j = 0; j < bbox_count[i]; ++j) { - add_bbox(_mat, bboxes++); - } - } else { // inference with whole image - add_bbox(_mat, nullptr); - } +int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, int image_count, const mmdeploy_text_detection_t* bboxes, const int* bbox_count, mmdeploy_value_t* output) +{ + if (image_count && images == nullptr) + { + return MMDEPLOY_E_INVALID_ARG; } + try + { + Value::Array input_images; + Value::Array input_bboxes; - *output = Take(Value{std::move(input_images), std::move(input_bboxes)}); - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + auto add_bbox = [&](Mat img, const mmdeploy_text_detection_t* det) + { + if (det) + { + const auto& b = det->bbox; + Value::Array bbox{b[0].x, b[0].y, b[1].x, b[1].y, b[2].x, b[2].y, b[3].x, b[3].y}; + input_bboxes.push_back({{"bbox", std::move(bbox)}}); + } + else + { + input_bboxes.push_back(nullptr); + } + input_images.push_back({{"ori_img", img}}); + }; + + for (int i = 0; i < image_count; ++i) + { + auto _mat = Cast(images[i]); + if (bboxes && bbox_count) + { + for (int j = 0; j < bbox_count[i]; ++j) + { + add_bbox(_mat, bboxes++); + } + } + else + { // inference with whole image + add_bbox(_mat, nullptr); + } + } + + *output = Take(Value{std::move(input_images), std::move(input_bboxes)}); + return MMDEPLOY_SUCCESS; + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) 
+ { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } -int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int image_count, +int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* images, + int image_count, const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_text_recognition_t** results) { - wrapped input; - if (auto ec = mmdeploy_text_recognizer_create_input(images, image_count, bboxes, bbox_count, - input.ptr())) { - return ec; - } - wrapped output; - if (auto ec = mmdeploy_text_recognizer_apply_v2(recognizer, input, output.ptr())) { - return ec; - } - if (auto ec = mmdeploy_text_recognizer_get_result(output, results)) { - return ec; - } - return MMDEPLOY_SUCCESS; + const int* bbox_count, + mmdeploy_text_recognition_t** results) +{ + wrapped input; + if (auto ec = mmdeploy_text_recognizer_create_input(images, image_count, bboxes, bbox_count, input.ptr())) + { + return ec; + } + wrapped output; + if (auto ec = mmdeploy_text_recognizer_apply_v2(recognizer, input, output.ptr())) + { + return ec; + } + if (auto ec = mmdeploy_text_recognizer_get_result(output, results)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, mmdeploy_value_t input, - mmdeploy_value_t* output) { - return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); +int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, mmdeploy_value_t input, mmdeploy_value_t* output) +{ + return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output); } int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer, - mmdeploy_sender_t input, mmdeploy_sender_t* output) { - return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)recognizer, input, output); + mmdeploy_sender_t input, + mmdeploy_sender_t* output) +{ + return mmdeploy_pipeline_apply_async((mmdeploy_pipeline_t)recognizer, input, output); } -MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_text_recognition_t** results) { - if (!output || !results) { - return MMDEPLOY_E_INVALID_ARG; - } - try { - std::vector recognitions; - from_value(Cast(output)->front(), recognitions); +MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, + mmdeploy_text_recognition_t** results) +{ + if (!output || !results) + { + return MMDEPLOY_E_INVALID_ARG; + } + try + { + std::vector recognitions; + from_value(Cast(output)->front(), recognitions); - size_t count = recognitions.size(); + size_t count = recognitions.size(); - auto deleter = [&](mmdeploy_text_recognition_t* p) { - mmdeploy_text_recognizer_release_result(p, static_cast(count)); - }; + auto deleter = [&](mmdeploy_text_recognition_t* p) + { + mmdeploy_text_recognizer_release_result(p, static_cast(count)); + }; - std::unique_ptr _results( - new mmdeploy_text_recognition_t[count]{}, deleter); + std::unique_ptr _results( + new mmdeploy_text_recognition_t[count]{}, + deleter); - size_t result_idx = 0; - for (const auto& bbox_result : recognitions) { - auto& res = _results[result_idx++]; + size_t result_idx = 0; + for (const auto& bbox_result : recognitions) + { + auto& res = _results[result_idx++]; - auto& score = bbox_result.score; - res.length = static_cast(score.size()); + auto& score = bbox_result.score; + res.length = static_cast(score.size()); - res.score = new 
float[score.size()]; - std::copy_n(score.data(), score.size(), res.score); + res.score = new float[score.size()]; + std::copy_n(score.data(), score.size(), res.score); - auto text = bbox_result.text; - res.text = new char[text.length() + 1]; - std::copy_n(text.data(), text.length() + 1, res.text); - } + auto text = bbox_result.text; + res.text = new char[text.length() + 1]; + std::copy_n(text.data(), text.length() + 1, res.text); + } - *results = _results.release(); - } catch (const std::exception& e) { - MMDEPLOY_ERROR("exception caught: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_SUCCESS; + *results = _results.release(); + } + catch (const std::exception& e) + { + MMDEPLOY_ERROR("exception caught: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_SUCCESS; } -void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, int count) { - for (int i = 0; i < count; ++i) { - delete[] results[i].score; - delete[] results[i].text; - } - delete[] results; +void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, int count) +{ + for (int i = 0; i < count; ++i) + { + delete[] results[i].score; + delete[] results[i].text; + } + delete[] results; } -void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer) { - mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); +void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer) +{ + mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer); } -int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* imgs, int img_count, +int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, + const mmdeploy_mat_t* imgs, + int img_count, const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, mmdeploy_sender_t* output) { - wrapped input_val; - if (auto ec = mmdeploy_text_recognizer_create_input(imgs, img_count, bboxes, bbox_count, - input_val.ptr())) { - return ec; - } - mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val); - if (auto ec = mmdeploy_text_recognizer_apply_async(recognizer, input_sndr, output)) { - return ec; - } - return MMDEPLOY_SUCCESS; + const int* bbox_count, + mmdeploy_sender_t* output) +{ + wrapped input_val; + if (auto ec = mmdeploy_text_recognizer_create_input(imgs, img_count, bboxes, bbox_count, input_val.ptr())) + { + return ec; + } + mmdeploy_sender_t input_sndr = mmdeploy_executor_just(input_val); + if (auto ec = mmdeploy_text_recognizer_apply_async(recognizer, input_sndr, output)) + { + return ec; + } + return MMDEPLOY_SUCCESS; } -int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, - mmdeploy_text_recognizer_continue_t cont, void* context, - mmdeploy_sender_t* output) { - auto sender = Guard([&] { - return Take( - LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender { +int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, + mmdeploy_text_recognizer_continue_t cont, + void* context, + mmdeploy_sender_t* output) +{ + auto sender = Guard([&] + { return Take( + LetValue(Take(input), [fn = cont, context](Value& value) -> TypeErasedSender + { mmdeploy_text_recognition_t* results{}; if (auto ec = mmdeploy_text_recognizer_get_result(Cast(&value), &results)) { return Just(Value()); @@ -241,12 +287,11 @@ int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, if (auto ec = fn(results, 
context, &output); ec || !output) { return Just(Value()); } - return Take(output); - })); - }); - if (sender) { - *output = sender; - return MMDEPLOY_SUCCESS; - } - return MMDEPLOY_E_FAIL; + return Take(output); })); }); + if (sender) + { + *output = sender; + return MMDEPLOY_SUCCESS; + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h index 6c18928242..f20c878028 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/text_recognizer.h @@ -13,149 +13,155 @@ #include "mmdeploy/text_detector.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_text_recognition_t { - char* text; - float* score; - int length; -} mmdeploy_text_recognition_t; - -typedef struct mmdeploy_text_recognizer* mmdeploy_text_recognizer_t; - -/** - * @brief Create a text recognizer instance - * @param[in] model an instance of mmocr text recognition model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created text recognizer, which must be destroyed - * by \ref mmdeploy_text_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, - int device_id, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Create a text recognizer instance - * @param[in] model_path path to text recognition model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created text recognizer, which must be destroyed - * by \ref mmdeploy_text_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Apply text recognizer to a batch of text images - * @param[in] recognizer text recognizer's handle created by \ref - * mmdeploy_text_recognizer_create_by_path - * @param[in] images a batch of text images - * @param[in] count number of images in the batch - * @param[out] results a linear buffer contains the recognized text, must be release - * by \ref mmdeploy_text_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int count, - mmdeploy_text_recognition_t** results); - -/** - * @brief Apply text recognizer to a batch of images supplied with text bboxes - * @param[in] recognizer text recognizer's handle created by \ref - * mmdeploy_text_recognizer_create_by_path - * @param[in] images a batch of text images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes detected by text detector - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @param[out] results a linear buffer contains the recognized text, which has the same length as \p - * bboxes, must be release by \ref mmdeploy_text_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* images, int 
image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_text_recognition_t** results); - -/** @brief Release result buffer returned by \ref mmdeploy_text_recognizer_apply or \ref - * mmdeploy_text_recognizer_apply_bbox - * @param[in] results result buffer by text recognizer - * @param[in] count length of \p result - */ -MMDEPLOY_API void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results, - int count); - -/** - * @brief destroy text recognizer - * @param[in] recognizer handle of text recognizer created by \ref - * mmdeploy_text_recognizer_create_by_path or \ref mmdeploy_text_recognizer_create - */ -MMDEPLOY_API void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer); - -/****************************************************************************** - * Experimental asynchronous APIs */ - -/** - * @brief Same as \ref mmdeploy_text_recognizer_create, but allows to control execution context of - * tasks via context - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_text_recognizer_t* recognizer); - -/** - * @brief Pack text-recognizer inputs into mmdeploy_value_t - * @param[in] images a batch of images - * @param[in] image_count number of images in the batch - * @param[in] bboxes bounding boxes detected by text detector - * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images - * @return value created - */ -MMDEPLOY_API int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images, - int image_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_value_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer, - mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Same as \ref mmdeploy_text_recognizer_apply_bbox, but input and output are packed in \ref - * mmdeploy_value_t. - */ -MMDEPLOY_API int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer, - mmdeploy_sender_t input, - mmdeploy_sender_t* output); - -typedef int (*mmdeploy_text_recognizer_continue_t)(mmdeploy_text_recognition_t* results, - void* context, mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer, - const mmdeploy_mat_t* imgs, int img_count, - const mmdeploy_text_detection_t* bboxes, - const int* bbox_count, - mmdeploy_sender_t* output); - -MMDEPLOY_API int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input, - mmdeploy_text_recognizer_continue_t cont, - void* context, mmdeploy_sender_t* output); - -/** - * @brief Unpack text-recognizer output from a mmdeploy_value_t - * @param[in] output - * @param[out] results - * @return status of the operation - */ -MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_text_recognition_t** results); + typedef struct mmdeploy_text_recognition_t + { + char* text; + float* score; + int length; + } mmdeploy_text_recognition_t; + + typedef struct mmdeploy_text_recognizer* mmdeploy_text_recognizer_t; + + /** + * @brief Create a text recognizer instance + * @param[in] model an instance of mmocr text recognition model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+     * @param[out] recognizer handle of the created text recognizer, which must be destroyed
+     * by \ref mmdeploy_text_recognizer_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_text_recognizer_t* recognizer);
+
+    /**
+     * @brief Create a text recognizer instance
+     * @param[in] model_path path to text recognition model
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] recognizer handle of the created text recognizer, which must be destroyed
+     * by \ref mmdeploy_text_recognizer_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_create_by_path(const char* model_path,
+                                                             const char* device_name,
+                                                             int device_id,
+                                                             mmdeploy_text_recognizer_t* recognizer);
+
+    /**
+     * @brief Apply text recognizer to a batch of text images
+     * @param[in] recognizer text recognizer's handle created by \ref
+     * mmdeploy_text_recognizer_create_by_path
+     * @param[in] images a batch of text images
+     * @param[in] count number of images in the batch
+     * @param[out] results a linear buffer containing the recognized text, which must be released
+     * by \ref mmdeploy_text_recognizer_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_apply(mmdeploy_text_recognizer_t recognizer,
+                                                    const mmdeploy_mat_t* images,
+                                                    int count,
+                                                    mmdeploy_text_recognition_t** results);
+
+    /**
+     * @brief Apply text recognizer to a batch of images supplied with text bboxes
+     * @param[in] recognizer text recognizer's handle created by \ref
+     * mmdeploy_text_recognizer_create_by_path
+     * @param[in] images a batch of text images
+     * @param[in] image_count number of images in the batch
+     * @param[in] bboxes bounding boxes detected by text detector
+     * @param[in] bbox_count number of bboxes of each \p images, must be same length as \p images
+     * @param[out] results a linear buffer containing the recognized text, which has the same length
+     * as \p bboxes and must be released by \ref mmdeploy_text_recognizer_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_apply_bbox(mmdeploy_text_recognizer_t recognizer,
+                                                         const mmdeploy_mat_t* images,
+                                                         int image_count,
+                                                         const mmdeploy_text_detection_t* bboxes,
+                                                         const int* bbox_count,
+                                                         mmdeploy_text_recognition_t** results);
+
+    /** @brief Release result buffer returned by \ref mmdeploy_text_recognizer_apply or \ref
+     * mmdeploy_text_recognizer_apply_bbox
+     * @param[in] results result buffer returned by the text recognizer
+     * @param[in] count length of \p results
+     */
+    MMDEPLOY_API void mmdeploy_text_recognizer_release_result(mmdeploy_text_recognition_t* results,
+                                                              int count);
+
+    /**
+     * @brief Destroy text recognizer
+     * @param[in] recognizer handle of text recognizer created by \ref
+     * mmdeploy_text_recognizer_create_by_path or \ref mmdeploy_text_recognizer_create
+     */
+    MMDEPLOY_API void mmdeploy_text_recognizer_destroy(mmdeploy_text_recognizer_t recognizer);
+
+    /******************************************************************************
+     * Experimental asynchronous APIs */
+
+    /**
+     * @brief Same as \ref mmdeploy_text_recognizer_create, but allows controlling the execution
+     * context of tasks via \p context
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_create_v2(mmdeploy_model_t model,
+                                                        mmdeploy_context_t context,
+                                                        mmdeploy_text_recognizer_t* recognizer);
+
+    /**
+     * @brief Pack 
+     * @param[in] images a batch of images
+     * @param[in] image_count number of images in the batch
+     * @param[in] bboxes bounding boxes detected by text detector
+     * @param[in] bbox_count number of bboxes of each image in \p images; must have the same length as \p images
+     * @param[out] output the created value
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_create_input(const mmdeploy_mat_t* images,
+                                                           int image_count,
+                                                           const mmdeploy_text_detection_t* bboxes,
+                                                           const int* bbox_count,
+                                                           mmdeploy_value_t* output);
+
+    /**
+     * @brief Same as \ref mmdeploy_text_recognizer_apply_bbox, but input and output are packed in \ref
+     * mmdeploy_value_t.
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_apply_v2(mmdeploy_text_recognizer_t recognizer,
+                                                       mmdeploy_value_t input,
+                                                       mmdeploy_value_t* output);
+
+    MMDEPLOY_API int mmdeploy_text_recognizer_apply_async(mmdeploy_text_recognizer_t recognizer,
+                                                          mmdeploy_sender_t input,
+                                                          mmdeploy_sender_t* output);
+
+    typedef int (*mmdeploy_text_recognizer_continue_t)(mmdeploy_text_recognition_t* results,
+                                                       void* context,
+                                                       mmdeploy_sender_t* output);
+
+    MMDEPLOY_API int mmdeploy_text_recognizer_apply_async_v3(mmdeploy_text_recognizer_t recognizer,
+                                                             const mmdeploy_mat_t* imgs,
+                                                             int img_count,
+                                                             const mmdeploy_text_detection_t* bboxes,
+                                                             const int* bbox_count,
+                                                             mmdeploy_sender_t* output);
+
+    MMDEPLOY_API int mmdeploy_text_recognizer_continue_async(mmdeploy_sender_t input,
+                                                             mmdeploy_text_recognizer_continue_t cont,
+                                                             void* context,
+                                                             mmdeploy_sender_t* output);
+
+    /**
+     * @brief Unpack text-recognizer output from a mmdeploy_value_t
+     * @param[in] output recognizer's inference output
+     * @param[out] results unpacked recognition results
+     * @return status of the operation
+     */
+    MMDEPLOY_API int mmdeploy_text_recognizer_get_result(mmdeploy_value_t output,
+                                                         mmdeploy_text_recognition_t** results);

#ifdef __cplusplus
}
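For orientation, a minimal usage sketch of the API above — not part of the patch. The model path is a placeholder, the input images are assumed to be caller-prepared BGR mmdeploy_mat_t buffers, and error handling is reduced to a single status check:

#include <cstdio>

#include "mmdeploy/text_recognizer.h"

int RecognizeTexts(const mmdeploy_mat_t* images, int image_count)
{
    mmdeploy_text_recognizer_t recognizer{};
    auto ec = mmdeploy_text_recognizer_create_by_path("path/to/text-recognition-model",  // placeholder
                                                      "cpu", 0, &recognizer);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }

    mmdeploy_text_recognition_t* results{};
    ec = mmdeploy_text_recognizer_apply(recognizer, images, image_count, &results);
    if (ec == MMDEPLOY_SUCCESS)
    {
        for (int i = 0; i < image_count; ++i)
        {
            // text is the recognized string; score points to per-character
            // confidences, length entries in total (as the struct layout above suggests)
            std::printf("image %d: %s\n", i, results[i].text);
        }
        mmdeploy_text_recognizer_release_result(results, image_count);
    }
    mmdeploy_text_recognizer_destroy(recognizer);
    return ec;
}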
diff --git a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp
index de71e57842..3f0ab3c305 100644
--- a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp
+++ b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.cpp
@@ -20,146 +20,178 @@
 using namespace mmdeploy;

-int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id,
-                                     mmdeploy_video_recognizer_t* recognizer) {
-  mmdeploy_context_t context{};
-  auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
-  if (ec != MMDEPLOY_SUCCESS) {
+int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer)
+{
+    mmdeploy_context_t context{};
+    auto ec = mmdeploy_context_create_by_device(device_name, device_id, &context);
+    if (ec != MMDEPLOY_SUCCESS)
+    {
+        return ec;
+    }
+    ec = mmdeploy_video_recognizer_create_v2(model, context, recognizer);
+    mmdeploy_context_destroy(context);
     return ec;
-  }
-  ec = mmdeploy_video_recognizer_create_v2(model, context, recognizer);
-  mmdeploy_context_destroy(context);
-  return ec;
 }

-int mmdeploy_video_recognizer_create_by_path(const char* model_path, const char* device_name,
-                                             int device_id,
-                                             mmdeploy_video_recognizer_t* recognizer) {
-  mmdeploy_model_t model{};
+int mmdeploy_video_recognizer_create_by_path(const char* model_path, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer)
+{
+    mmdeploy_model_t model{};

-  if (auto ec = mmdeploy_model_create_by_path(model_path, &model)) {
+    if (auto ec = mmdeploy_model_create_by_path(model_path, &model))
+    {
+        return ec;
+    }
+    auto ec = mmdeploy_video_recognizer_create(model, device_name, device_id, recognizer);
+    mmdeploy_model_destroy(model);
    return ec;
-  }
-  auto ec = mmdeploy_video_recognizer_create(model, device_name, device_id, recognizer);
-  mmdeploy_model_destroy(model);
-  return ec;
 }

-int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer,
-                                    const mmdeploy_mat_t* images,
-                                    const mmdeploy_video_sample_info_t* video_info, int video_count,
-                                    mmdeploy_video_recognition_t** results, int** result_count) {
-  wrapped<mmdeploy_value_t> input;
-  if (auto ec =
-          mmdeploy_video_recognizer_create_input(images, video_info, video_count, input.ptr())) {
-    return ec;
-  }
+int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer,
+                                    const mmdeploy_mat_t* images,
+                                    const mmdeploy_video_sample_info_t* video_info,
+                                    int video_count,
+                                    mmdeploy_video_recognition_t** results,
+                                    int** result_count)
+{
+    wrapped<mmdeploy_value_t> input;
+    if (auto ec =
+            mmdeploy_video_recognizer_create_input(images, video_info, video_count, input.ptr()))
+    {
+        return ec;
+    }

-  wrapped<mmdeploy_value_t> output;
-  if (auto ec = mmdeploy_video_recognizer_apply_v2(recognizer, input, output.ptr())) {
-    return ec;
-  }
+    wrapped<mmdeploy_value_t> output;
+    if (auto ec = mmdeploy_video_recognizer_apply_v2(recognizer, input, output.ptr()))
+    {
+        return ec;
+    }

-  if (auto ec = mmdeploy_video_recognizer_get_result(output, results, result_count)) {
-    return ec;
-  }
-  return MMDEPLOY_SUCCESS;
+    if (auto ec = mmdeploy_video_recognizer_get_result(output, results, result_count))
+    {
+        return ec;
+    }
+    return MMDEPLOY_SUCCESS;
 }

 void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results,
-                                              int* result_count, int video_count) {
-  delete[] results;
-  delete[] result_count;
+                                              int* result_count,
+                                              int video_count)
+{
+    delete[] results;
+    delete[] result_count;
 }

-void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer) {
-  mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer);
+void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer)
+{
+    mmdeploy_pipeline_destroy((mmdeploy_pipeline_t)recognizer);
 }

-int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context,
-                                        mmdeploy_video_recognizer_t* recognizer) {
-  return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)recognizer);
+int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, mmdeploy_context_t context, mmdeploy_video_recognizer_t* recognizer)
+{
+    return mmdeploy_pipeline_create_from_model(model, context, (mmdeploy_pipeline_t*)recognizer);
 }
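From the caller's side, the sequence these functions implement reads as follows — a sketch, not part of the patch, for a single video whose clip_len * num_clips sampled frames the caller has already prepared; the model path is a placeholder:

#include <cstdio>

#include "mmdeploy/video_recognizer.h"

// `frames` must hold info.clip_len * info.num_clips sampled frames of one video.
int RecognizeVideo(const mmdeploy_mat_t* frames, const mmdeploy_video_sample_info_t& info)
{
    mmdeploy_video_recognizer_t recognizer{};
    auto ec = mmdeploy_video_recognizer_create_by_path("path/to/video-recognition-model",  // placeholder
                                                       "cpu", 0, &recognizer);
    if (ec != MMDEPLOY_SUCCESS)
    {
        return ec;
    }

    mmdeploy_video_recognition_t* results{};
    int* result_count{};
    ec = mmdeploy_video_recognizer_apply(recognizer, frames, &info, 1, &results, &result_count);
    if (ec == MMDEPLOY_SUCCESS)
    {
        // result_count[0] recognition results were produced for the single video
        for (int i = 0; i < result_count[0]; ++i)
        {
            std::printf("label %d, score %.4f\n", results[i].label_id, results[i].score);
        }
        mmdeploy_video_recognizer_release_result(results, result_count, 1);
    }
    mmdeploy_video_recognizer_destroy(recognizer);
    return ec;
}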
-int mmdeploy_video_recognizer_create_input(const mmdeploy_mat_t* images,
+int mmdeploy_video_recognizer_create_input(const mmdeploy_mat_t* images,
                                            const mmdeploy_video_sample_info_t* video_info,
-                                           int video_count, mmdeploy_value_t* value) {
-  if (video_count && (images == nullptr || video_info == nullptr)) {
-    return MMDEPLOY_E_INVALID_ARG;
-  }
-  try {
-    auto input = std::make_unique<Value>(Value{Value::kArray});
-    auto sample = std::make_unique<Value>(Value::kArray);
-    for (int i = 0; i < video_count; ++i) {
-      int clip_len = video_info[i].clip_len;
-      int num_clips = video_info[i].num_clips;
-      int n_mat = clip_len * num_clips;
-      for (int j = 0; j < n_mat; j++) {
-        mmdeploy::Mat _mat{images[j].height,
-                           images[j].width,
-                           PixelFormat(images[j].format),
-                           DataType(images[j].type),
-                           images[j].data,
-                           images[j].device ? *(const Device*)(images[j].device) : Device{0}};
-        sample->push_back({{"ori_img", _mat}, {"clip_len", clip_len}, {"num_clips", num_clips}});
-      }
-      input->front().push_back(std::move(*sample.release()));
+                                           int video_count,
+                                           mmdeploy_value_t* value)
+{
+    if (video_count && (images == nullptr || video_info == nullptr))
+    {
+        return MMDEPLOY_E_INVALID_ARG;
+    }
+    try
+    {
+        auto input = std::make_unique<Value>(Value{Value::kArray});
+        auto sample = std::make_unique<Value>(Value::kArray);
+        for (int i = 0; i < video_count; ++i)
+        {
+            int clip_len = video_info[i].clip_len;
+            int num_clips = video_info[i].num_clips;
+            int n_mat = clip_len * num_clips;
+            for (int j = 0; j < n_mat; j++)
+            {
+                mmdeploy::Mat _mat{images[j].height,
+                                   images[j].width,
+                                   PixelFormat(images[j].format),
+                                   DataType(images[j].type),
+                                   images[j].data,
+                                   images[j].device ? *(const Device*)(images[j].device) : Device{0}};
+                sample->push_back({{"ori_img", _mat}, {"clip_len", clip_len}, {"num_clips", num_clips}});
+            }
+            input->front().push_back(std::move(*sample.release()));
+        }
+        *value = Cast(input.release());
+    }
+    catch (const std::exception& e)
+    {
+        MMDEPLOY_ERROR("unhandled exception: {}", e.what());
+    }
+    catch (...)
+    {
+        MMDEPLOY_ERROR("unknown exception caught");
    }
-    *value = Cast(input.release());
-  } catch (const std::exception& e) {
-    MMDEPLOY_ERROR("unhandled exception: {}", e.what());
-  } catch (...) {
-    MMDEPLOY_ERROR("unknown exception caught");
-  }
-  return MMDEPLOY_SUCCESS;
+    return MMDEPLOY_SUCCESS;
 }

 int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer,
-                                       mmdeploy_value_t input, mmdeploy_value_t* output) {
-  return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output);
+                                       mmdeploy_value_t input,
+                                       mmdeploy_value_t* output)
+{
+    return mmdeploy_pipeline_apply((mmdeploy_pipeline_t)recognizer, input, output);
 }

-int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output,
+int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output,
                                          mmdeploy_video_recognition_t** results,
-                                         int** result_count) {
-  if (!output || !results || !result_count) {
-    return MMDEPLOY_E_INVALID_ARG;
-  }
-  try {
-    Value& value = Cast(output)->front();
-
-    auto classify_outputs = from_value<std::vector<mmaction::Labels>>(value);
-
-    std::vector<int> _result_count;
-    _result_count.reserve(classify_outputs.size());
-
-    for (const auto& cls_output : classify_outputs) {
-      _result_count.push_back((int)cls_output.size());
+                                         int** result_count)
+{
+    if (!output || !results || !result_count)
+    {
+        return MMDEPLOY_E_INVALID_ARG;
    }
+    try
+    {
+        Value& value = Cast(output)->front();
+
+        auto classify_outputs = from_value<std::vector<mmaction::Labels>>(value);
+
+        std::vector<int> _result_count;
+        _result_count.reserve(classify_outputs.size());
+
+        for (const auto& cls_output : classify_outputs)
+        {
+            _result_count.push_back((int)cls_output.size());
+        }
+
+        auto total = std::accumulate(begin(_result_count), end(_result_count), 0);
+
+        std::unique_ptr<int[]> result_count_data(new int[_result_count.size()]{});
+        std::copy(_result_count.begin(),
_result_count.end(), result_count_data.get()); + + std::unique_ptr result_data( + new mmdeploy_video_recognition_t[total]{}); + auto result_ptr = result_data.get(); + for (const auto& cls_output : classify_outputs) + { + for (const auto& label : cls_output) + { + result_ptr->label_id = label.label_id; + result_ptr->score = label.score; + ++result_ptr; + } + } + + *result_count = result_count_data.release(); + *results = result_data.release(); + + return MMDEPLOY_SUCCESS; } - - *result_count = result_count_data.release(); - *results = result_data.release(); - - return MMDEPLOY_SUCCESS; - } catch (const std::exception& e) { - MMDEPLOY_ERROR("unhandled exception: {}", e.what()); - } catch (...) { - MMDEPLOY_ERROR("unknown exception caught"); - } - return MMDEPLOY_E_FAIL; + catch (const std::exception& e) + { + MMDEPLOY_ERROR("unhandled exception: {}", e.what()); + } + catch (...) + { + MMDEPLOY_ERROR("unknown exception caught"); + } + return MMDEPLOY_E_FAIL; } diff --git a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h index e98b2bd07e..6893170e7d 100644 --- a/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h +++ b/csrc/mmdeploy/apis/c/mmdeploy/video_recognizer.h @@ -13,124 +13,129 @@ #include "mmdeploy/model.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -typedef struct mmdeploy_video_recognition_t { - int label_id; - float score; -} mmdeploy_video_recognition_t; - -typedef struct mmdeploy_video_sample_info_t { - int clip_len; - int num_clips; -} mmdeploy_video_sample_info_t; - -typedef struct mmdeploy_video_recognizer* mmdeploy_video_recognizer_t; - -/** - * @brief Create video recognizer's handle - * @param[in] model an instance of mmaction sdk model created by - * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created video recognizer, which must be destroyed - * by \ref mmdeploy_video_recognizer_destroy - * @return status of creating video recognizer's handle - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, - int device_id, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Create a video recognizer instance - * @param[in] model_path path to video recognition model - * @param[in] device_name name of device, such as "cpu", "cuda", etc. - * @param[in] device_id id of device. - * @param[out] recognizer handle of the created video recognizer, which must be destroyed - * by \ref mmdeploy_video_recognizer_destroy - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_by_path(const char* model_path, - const char* device_name, int device_id, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] recognizer video recognizer's handle created by \ref - * mmdeploy_video_recognizer_create_by_path - * @param[in] images a batch of videos - * @param[in] video_info video information of each video - * @param[in] video_count number of videos - * @param[out] results a linear buffer contains the recognized video, must be release - * by \ref mmdeploy_video_recognizer_release_result - * @param[out] result_count a linear buffer with length being \p video_count to save the number of - * recognition results of each video. 
It must be released by \ref - * mmdeploy_video_recognizer_release_result - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer, - const mmdeploy_mat_t* images, - const mmdeploy_video_sample_info_t* video_info, - int video_count, - mmdeploy_video_recognition_t** results, - int** result_count); - -/** @brief Release result buffer returned by \ref mmdeploy_video_recognizer_apply - * @param[in] results result buffer by video recognizer - * @param[in] result_count \p results size buffer - * @param[in] video_count length of \p result_count - */ -MMDEPLOY_API void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results, - int* result_count, int video_count); - -/** - * @brief destroy video recognizer - * @param[in] recognizer handle of video recognizer created by \ref - * mmdeploy_video_recognizer_create_by_path or \ref mmdeploy_video_recognizer_create - */ -MMDEPLOY_API void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer); - -/** - * @brief Same as \ref mmdeploy_video_recognizer_create, but allows to control execution context of - * tasks via context - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model, - mmdeploy_context_t context, - mmdeploy_video_recognizer_t* recognizer); - -/** - * @brief Pack video recognizer inputs into mmdeploy_value_t - * @param[in] images a batch of videos - * @param[in] video_info video information of each video - * @param[in] video_count number of videos in the batch - * @param[out] value created value - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_create_input( - const mmdeploy_mat_t* images, const mmdeploy_video_sample_info_t* video_info, int video_count, - mmdeploy_value_t* value); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] input packed input - * @param[out] output inference output - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer, - mmdeploy_value_t input, - mmdeploy_value_t* output); - -/** - * @brief Apply video recognizer to a batch of videos - * @param[in] output inference output - * @param[out] results structured output - * @param[out] result_count number of each videos - * @return status code of the operation - */ -MMDEPLOY_API int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output, - mmdeploy_video_recognition_t** results, - int** result_count); + typedef struct mmdeploy_video_recognition_t + { + int label_id; + float score; + } mmdeploy_video_recognition_t; + + typedef struct mmdeploy_video_sample_info_t + { + int clip_len; + int num_clips; + } mmdeploy_video_sample_info_t; + + typedef struct mmdeploy_video_recognizer* mmdeploy_video_recognizer_t; + + /** + * @brief Create video recognizer's handle + * @param[in] model an instance of mmaction sdk model created by + * \ref mmdeploy_model_create_by_path or \ref mmdeploy_model_create in \ref model.h + * @param[in] device_name name of device, such as "cpu", "cuda", etc. + * @param[in] device_id id of device. 
+     * @param[out] recognizer handle of the created video recognizer, which must be destroyed
+     *             by \ref mmdeploy_video_recognizer_destroy
+     * @return status of creating video recognizer's handle
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_create(mmdeploy_model_t model, const char* device_name, int device_id, mmdeploy_video_recognizer_t* recognizer);
+
+    /**
+     * @brief Create a video recognizer instance
+     * @param[in] model_path path to video recognition model
+     * @param[in] device_name name of device, such as "cpu", "cuda", etc.
+     * @param[in] device_id id of device.
+     * @param[out] recognizer handle of the created video recognizer, which must be destroyed
+     *             by \ref mmdeploy_video_recognizer_destroy
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_create_by_path(const char* model_path,
+                                                              const char* device_name,
+                                                              int device_id,
+                                                              mmdeploy_video_recognizer_t* recognizer);
+
+    /**
+     * @brief Apply video recognizer to a batch of videos
+     * @param[in] recognizer video recognizer's handle created by \ref
+     * mmdeploy_video_recognizer_create_by_path
+     * @param[in] images frames of a batch of videos
+     * @param[in] video_info video information of each video
+     * @param[in] video_count number of videos
+     * @param[out] results a linear buffer containing the recognition results, which must be released
+     * by \ref mmdeploy_video_recognizer_release_result
+     * @param[out] result_count a linear buffer with length being \p video_count that saves the number of
+     * recognition results of each video. It must be released by \ref
+     * mmdeploy_video_recognizer_release_result
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_apply(mmdeploy_video_recognizer_t recognizer,
+                                                     const mmdeploy_mat_t* images,
+                                                     const mmdeploy_video_sample_info_t* video_info,
+                                                     int video_count,
+                                                     mmdeploy_video_recognition_t** results,
+                                                     int** result_count);
+
+    /** @brief Release result buffer returned by \ref mmdeploy_video_recognizer_apply
+     * @param[in] results result buffer returned by video recognizer
+     * @param[in] result_count buffer holding the size of \p results per video
+     * @param[in] video_count length of \p result_count
+     */
+    MMDEPLOY_API void mmdeploy_video_recognizer_release_result(mmdeploy_video_recognition_t* results,
+                                                               int* result_count,
+                                                               int video_count);
+
+    /**
+     * @brief Destroy video recognizer
+     * @param[in] recognizer handle of video recognizer created by \ref
+     * mmdeploy_video_recognizer_create_by_path or \ref mmdeploy_video_recognizer_create
+     */
+    MMDEPLOY_API void mmdeploy_video_recognizer_destroy(mmdeploy_video_recognizer_t recognizer);
+
+    /**
+     * @brief Same as \ref mmdeploy_video_recognizer_create, but allows controlling the execution
+     * context of tasks via \p context
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_create_v2(mmdeploy_model_t model,
+                                                         mmdeploy_context_t context,
+                                                         mmdeploy_video_recognizer_t* recognizer);
+
+    /**
+     * @brief Pack video recognizer inputs into mmdeploy_value_t
+     * @param[in] images frames of a batch of videos
+     * @param[in] video_info video information of each video
+     * @param[in] video_count number of videos in the batch
+     * @param[out] value created value
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_create_input(
+        const mmdeploy_mat_t* images,
+        const mmdeploy_video_sample_info_t* video_info,
+        int video_count,
+        mmdeploy_value_t* value);
+
+    /**
+     * @brief Apply video recognizer to the packed input
+     * @param[in] recognizer video recognizer's handle
+     * @param[in] input packed input
+     * @param[out] output inference output
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_apply_v2(mmdeploy_video_recognizer_t recognizer,
+                                                        mmdeploy_value_t input,
+                                                        mmdeploy_value_t* output);
+
+    /**
+     * @brief Unpack video recognizer output from a mmdeploy_value_t
+     * @param[in] output inference output
+     * @param[out] results structured recognition results
+     * @param[out] result_count number of recognition results of each video
+     * @return status code of the operation
+     */
+    MMDEPLOY_API int mmdeploy_video_recognizer_get_result(mmdeploy_value_t output,
+                                                          mmdeploy_video_recognition_t** results,
+                                                          int** result_count);

#ifdef __cplusplus
}

diff --git a/csrc/mmdeploy/apis/cxx/CMakeLists.txt b/csrc/mmdeploy/apis/cxx/CMakeLists.txt
index 0ee897ca4d..9073665516 100644
--- a/csrc/mmdeploy/apis/cxx/CMakeLists.txt
+++ b/csrc/mmdeploy/apis/cxx/CMakeLists.txt
@@ -4,41 +4,44 @@
 cmake_minimum_required(VERSION 3.14)
 project(mmdeploy_cxx_api)

 add_library(${PROJECT_NAME} INTERFACE)
-target_include_directories(${PROJECT_NAME} INTERFACE
-    $
-    $)
+target_include_directories(
+  ${PROJECT_NAME} INTERFACE $
+                            $)
 target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)

 set(_tasks ${MMDEPLOY_TASKS} pipeline)
-foreach (task ${_tasks})
-  target_link_libraries(mmdeploy_${task} INTERFACE ${PROJECT_NAME})
-  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${task}.hpp
-          DESTINATION include/mmdeploy)
-endforeach ()
+foreach(task ${_tasks})
+  target_link_libraries(mmdeploy_${task} INTERFACE ${PROJECT_NAME})
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/${task}.hpp
+          DESTINATION include/mmdeploy)
+endforeach()
-if (TARGET mmdeploy)
-  target_include_directories(${PROJECT_NAME} INTERFACE
-      $
-      $
-      $
-      )
-  target_include_directories(${PROJECT_NAME} INTERFACE
-      $
-      $
-      $
-      )
-  if (NOT MMDEPLOY_SPDLOG_EXTERNAL)
-    target_include_directories(${PROJECT_NAME} INTERFACE
-        $
-        $)
-  endif ()
-  target_link_libraries(mmdeploy INTERFACE ${PROJECT_NAME})
-else ()
-  target_link_libraries(${PROJECT_NAME} INTERFACE mmdeploy::core)
-endif ()
+if(TARGET mmdeploy)
+  target_include_directories(
+    ${PROJECT_NAME}
+    INTERFACE $
+              $
+              $)
+  target_include_directories(
+    ${PROJECT_NAME}
+    INTERFACE $
+              $
+              $)
+  if(NOT MMDEPLOY_SPDLOG_EXTERNAL)
+    target_include_directories(
+      ${PROJECT_NAME}
+      INTERFACE
+        $
+        $)
+  endif()
+  target_link_libraries(mmdeploy INTERFACE ${PROJECT_NAME})
+else()
+  target_link_libraries(${PROJECT_NAME} INTERFACE mmdeploy::core)
+endif()

 mmdeploy_export_impl(${PROJECT_NAME})

 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmdeploy/common.hpp DESTINATION include/mmdeploy)
-install(DIRECTORY ${CMAKE_SOURCE_DIR}/demo/csrc/ DESTINATION example/cpp
-        FILES_MATCHING
-        PATTERN "*.cxx"
-        PATTERN "*.h"
-        )
+install(
+  DIRECTORY ${CMAKE_SOURCE_DIR}/demo/csrc/
+  DESTINATION example/cpp
+  FILES_MATCHING
+  PATTERN "*.cxx"
+  PATTERN "*.h")
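Before the per-task C++ wrappers that follow, a quick sense of how this convenience API reads in application code — a sketch, not part of the patch. The model path and test image are placeholders, and mmdeploy_classification_t is assumed to expose label_id and score as in the C API:

#include <cstdio>

#include "mmdeploy/classifier.hpp"
#include "opencv2/imgcodecs/imgcodecs.hpp"

int main()
{
    cv::Mat img = cv::imread("demo.jpg");                    // placeholder test image
    mmdeploy::Model model("path/to/classification-model");   // placeholder model path
    // cv::Mat converts implicitly to mmdeploy::Mat; Context converts from Device
    mmdeploy::Classifier classifier(model, mmdeploy::Context(mmdeploy::Device("cpu")));
    for (const auto& cls : classifier.Apply(img))  // Result_ is iterable
    {
        std::printf("label: %d, score: %.4f\n", cls.label_id, cls.score);
    }
    return 0;
}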
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp
index 1d9880fb7d..5ba395ad77 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/classifier.hpp
@@ -6,68 +6,87 @@
 #include "mmdeploy/classifier.h"
 #include "mmdeploy/common.hpp"

-namespace mmdeploy {
-
-namespace cxx {
-
-using Classification = mmdeploy_classification_t;
-
-class Classifier : public NonMovable {
- public:
-  Classifier(const Model& model, const Context& context) {
-    auto ec = mmdeploy_classifier_create_v2(model, context, &classifier_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~Classifier() {
-    if (classifier_) {
-      mmdeploy_classifier_destroy(classifier_);
-      classifier_ = {};
-    }
-  }
-
-  using Result = Result_<Classification>;
-
-  std::vector<Result> Apply(Span<const Mat> images) {
-    if (images.empty()) {
-      return {};
-    }
-
-    Classification* results{};
-    int* result_count{};
-    auto ec = mmdeploy_classifier_apply(classifier_, reinterpret(images.data()),
-                                        static_cast<int>(images.size()), &results, &result_count);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    std::shared_ptr<Classification> data(results, [result_count, count = images.size()](auto p) {
-      mmdeploy_classifier_release_result(p, result_count, count);
-    });
-
-    size_t offset = 0;
-    for (size_t i = 0; i < images.size(); ++i) {
-      offset += rets.emplace_back(offset, result_count[i], data).size();
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& img) { return Apply(Span{img})[0]; }
-
- private:
-  mmdeploy_classifier_t classifier_{};
-};
-
-}  // namespace cxx
-
-using cxx::Classification;
-using cxx::Classifier;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using Classification = mmdeploy_classification_t;
+
+        class Classifier : public NonMovable
+        {
+          public:
+            Classifier(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_classifier_create_v2(model, context, &classifier_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~Classifier()
+            {
+                if (classifier_)
+                {
+                    mmdeploy_classifier_destroy(classifier_);
+                    classifier_ = {};
+                }
+            }
+
+            using Result = Result_<Classification>;
+
+            std::vector<Result> Apply(Span<const Mat> images)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                Classification* results{};
+                int* result_count{};
+                auto ec = mmdeploy_classifier_apply(classifier_,
+                                                    reinterpret(images.data()),
+                                                    static_cast<int>(images.size()),
+                                                    &results,
+                                                    &result_count);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                std::shared_ptr<Classification> data(results,
+                                                     [result_count, count = images.size()](auto p)
+                                                     {
+                                                         mmdeploy_classifier_release_result(p, result_count, count);
+                                                     });
+
+                size_t offset = 0;
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    offset += rets.emplace_back(offset, result_count[i], data).size();
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& img)
+            {
+                return Apply(Span{img})[0];
+            }
+
+          private:
+            mmdeploy_classifier_t classifier_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::Classification;
+    using cxx::Classifier;

 }  // namespace mmdeploy

diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp
index 610c3a8b9e..07b6b225b2 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/common.hpp
@@ -16,253 +16,432 @@
 #include "mmdeploy/model.h"

 #ifndef MMDEPLOY_CXX_USE_OPENCV
-#define MMDEPLOY_CXX_USE_OPENCV 1
+  #define MMDEPLOY_CXX_USE_OPENCV 1
 #endif

 #if MMDEPLOY_CXX_USE_OPENCV
-#include "opencv2/core/core.hpp"
+  #include "opencv2/core/core.hpp"
 #endif

-namespace mmdeploy {
-
-namespace cxx {
-
-using Rect = mmdeploy_rect_t;
-
-template <typename T>
-class UniqueHandle : public NonCopyable {
- public:
-  UniqueHandle() = default;
-  explicit UniqueHandle(T handle) : handle_(handle) {}
-
-  // derived class must destroy the object and reset `handle_`
-  ~UniqueHandle() { assert(handle_ == nullptr); }
-
-  UniqueHandle(UniqueHandle&& o) noexcept : handle_(std::exchange(o.handle_, nullptr)) {}
-  UniqueHandle& operator=(UniqueHandle&& o) noexcept {
-    if (this != &o) {
-      handle_ = std::exchange(o.handle_, nullptr);
-    }
-    return *this;
-  }
-
-  explicit operator T() const noexcept { return
handle_; } - T operator->() const noexcept { return handle_; } - - protected: - T handle_{}; -}; - -class Model { - public: - explicit Model(const char* path) { - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create_by_path(path, &model); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - model_.reset(model, [](auto p) { mmdeploy_model_destroy(p); }); - } - - explicit Model(const std::string& path) : Model(path.c_str()) {} - - Model(const void* buffer, size_t size) { - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create(buffer, static_cast(size), &model); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - model_.reset(model, [](auto p) { mmdeploy_model_destroy(p); }); - } - - operator mmdeploy_model_t() const noexcept { return model_.get(); } - - private: - std::shared_ptr model_{}; -}; - -class Device { - public: - explicit Device(std::string name, int index = 0) : name_(std::move(name)), index_(index) { - mmdeploy_device_t device{}; - auto ec = mmdeploy_device_create(name_.c_str(), index, &device); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - device_.reset(device, [](auto p) { mmdeploy_device_destroy(p); }); - } - - const char* name() const noexcept { return name_.c_str(); } - int index() const noexcept { return index_; } - - operator mmdeploy_device_t() const noexcept { return device_.get(); } - - private: - std::string name_; - int index_; - std::shared_ptr device_; -}; - -class Profiler { - public: - explicit Profiler(std::string_view path) : path_(path) { - mmdeploy_profiler_t profiler{}; - auto ec = mmdeploy_profiler_create(path_.c_str(), &profiler); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - profiler_.reset(profiler, [](auto p) { mmdeploy_profiler_destroy(p); }); - }; - - operator mmdeploy_profiler_t() const noexcept { return profiler_.get(); } - - private: - std::string path_; - std::shared_ptr profiler_; -}; - -class Mat { - public: - Mat() : desc_{} {} - - Mat(int height, int width, int channels, mmdeploy_pixel_format_t format, - mmdeploy_data_type_t type, uint8_t* data, mmdeploy_device_t device = nullptr) - : desc_{data, height, width, channels, format, type, device} {} - - Mat(const mmdeploy_mat_t& desc) : desc_(desc) {} // NOLINT - - const mmdeploy_mat_t& desc() const noexcept { return desc_; } +namespace mmdeploy +{ + + namespace cxx + { + + using Rect = mmdeploy_rect_t; + + template + class UniqueHandle : public NonCopyable + { + public: + UniqueHandle() = default; + explicit UniqueHandle(T handle) + : handle_(handle) + { + } + + // derived class must destroy the object and reset `handle_` + ~UniqueHandle() + { + assert(handle_ == nullptr); + } + + UniqueHandle(UniqueHandle&& o) noexcept + : handle_(std::exchange(o.handle_, nullptr)) + { + } + + UniqueHandle& operator=(UniqueHandle&& o) noexcept + { + if (this != &o) + { + handle_ = std::exchange(o.handle_, nullptr); + } + return *this; + } + + explicit operator T() const noexcept + { + return handle_; + } + + T operator->() const noexcept + { + return handle_; + } + + protected: + T handle_{}; + }; + + class Model + { + public: + explicit Model(const char* path) + { + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create_by_path(path, &model); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + model_.reset(model, + [](auto p) + { + mmdeploy_model_destroy(p); + }); + } + + explicit Model(const std::string& path) + : Model(path.c_str()) + { + } + + Model(const void* buffer, 
size_t size) + { + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create(buffer, + static_cast(size), + &model); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + model_.reset(model, + [](auto p) + { + mmdeploy_model_destroy(p); + }); + } + + operator mmdeploy_model_t() const noexcept + { + return model_.get(); + } + + private: + std::shared_ptr model_{}; + }; + + class Device + { + public: + explicit Device(std::string name, int index = 0) + : name_(std::move(name)) + , index_(index) + { + mmdeploy_device_t device{}; + auto ec = mmdeploy_device_create(name_.c_str(), + index, + &device); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + device_.reset(device, + [](auto p) + { + mmdeploy_device_destroy(p); + }); + } + + const char* name() const noexcept + { + return name_.c_str(); + } + + int index() const noexcept + { + return index_; + } + + operator mmdeploy_device_t() const noexcept + { + return device_.get(); + } + + private: + std::string name_; + int index_; + std::shared_ptr device_; + }; + + class Profiler + { + public: + explicit Profiler(std::string_view path) + : path_(path) + { + mmdeploy_profiler_t profiler{}; + auto ec = mmdeploy_profiler_create(path_.c_str(), &profiler); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + profiler_.reset(profiler, + [](auto p) + { + mmdeploy_profiler_destroy(p); + }); + }; + + operator mmdeploy_profiler_t() const noexcept + { + return profiler_.get(); + } + + private: + std::string path_; + std::shared_ptr profiler_; + }; + + class Mat + { + public: + Mat() + : desc_{} + { + } + + Mat(int height, + int width, + int channels, + mmdeploy_pixel_format_t format, + mmdeploy_data_type_t type, + uint8_t* data, + mmdeploy_device_t device = nullptr) + : desc_{data, + height, + width, + channels, + format, + type, + device} + { + } + + Mat(const mmdeploy_mat_t& desc) + : desc_(desc) + { + } // NOLINT + + const mmdeploy_mat_t& desc() const noexcept + { + return desc_; + } #if MMDEPLOY_CXX_USE_OPENCV - Mat(const cv::Mat& mat, mmdeploy_pixel_format_t pixel_format) - : desc_{mat.data, mat.rows, mat.cols, mat.channels(), pixel_format, GetCvType(mat.depth())} { - if (pixel_format == MMDEPLOY_PIXEL_FORMAT_COUNT) { - throw_exception(eNotSupported); - } - if (desc_.type == MMDEPLOY_DATA_TYPE_COUNT) { - throw_exception(eNotSupported); - } - } - Mat(const cv::Mat& mat) : Mat(mat, GetCvFormat(mat.channels())) {} - - static mmdeploy_data_type_t GetCvType(int depth) { - switch (depth) { - case CV_8U: - return MMDEPLOY_DATA_TYPE_UINT8; - case CV_32F: - return MMDEPLOY_DATA_TYPE_FLOAT; - default: - return MMDEPLOY_DATA_TYPE_COUNT; - } - } - static mmdeploy_pixel_format_t GetCvFormat(int channels) { - switch (channels) { - case 1: - return MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; - case 3: - return MMDEPLOY_PIXEL_FORMAT_BGR; - case 4: - return MMDEPLOY_PIXEL_FORMAT_BGRA; - default: - return MMDEPLOY_PIXEL_FORMAT_COUNT; - } - } + Mat(const cv::Mat& mat, mmdeploy_pixel_format_t pixel_format) + : desc_{mat.data, + mat.rows, + mat.cols, + mat.channels(), + pixel_format, + GetCvType(mat.depth())} + { + if (pixel_format == MMDEPLOY_PIXEL_FORMAT_COUNT) + { + throw_exception(eNotSupported); + } + + if (desc_.type == MMDEPLOY_DATA_TYPE_COUNT) + { + throw_exception(eNotSupported); + } + } + + Mat(const cv::Mat& mat) + : Mat(mat, GetCvFormat(mat.channels())) + { + } + + static mmdeploy_data_type_t GetCvType(int depth) + { + switch (depth) + { + case CV_8U: + return MMDEPLOY_DATA_TYPE_UINT8; + case 
CV_32F: + return MMDEPLOY_DATA_TYPE_FLOAT; + default: + return MMDEPLOY_DATA_TYPE_COUNT; + } + } + + static mmdeploy_pixel_format_t GetCvFormat(int channels) + { + switch (channels) + { + case 1: + return MMDEPLOY_PIXEL_FORMAT_GRAYSCALE; + case 3: + return MMDEPLOY_PIXEL_FORMAT_BGR; + case 4: + return MMDEPLOY_PIXEL_FORMAT_BGRA; + default: + return MMDEPLOY_PIXEL_FORMAT_COUNT; + } + } #endif - private: - mmdeploy_mat_t desc_; -}; - -template -class Result_ { - public: - using value_type = T; - using size_type = size_t; - using difference_type = ptrdiff_t; - using reference = T&; - using const_reference = const T&; - using pointer = T*; - using const_pointer = const T*; - using iterator = T*; - using const_iterator = T*; - - Result_(size_t offset, size_t size, std::shared_ptr data) - : offset_(offset), size_(size), data_(std::move(data)) {} - - T& operator[](size_t index) const noexcept { return *(data_.get() + offset_ + index); } - size_t size() const noexcept { return size_; } - T* begin() const noexcept { return data_.get() + offset_; } - T* end() const noexcept { return begin() + size_; } - - T* operator->() const noexcept { return data_.get(); } - T& operator*() const noexcept { return *data_; } - - private: - size_t offset_; - size_t size_; - std::shared_ptr data_; -}; - -inline const mmdeploy_mat_t* reinterpret(const Mat* p) { - return reinterpret_cast(p); -} - -class Scheduler { - public: - explicit Scheduler(mmdeploy_scheduler_t scheduler) { - scheduler_.reset(scheduler, [](auto p) { mmdeploy_scheduler_destroy(p); }); - } - - static Scheduler ThreadPool(int num_threads) { - return Scheduler(mmdeploy_executor_create_thread_pool(num_threads)); - } - static Scheduler Thread() { return Scheduler(mmdeploy_executor_create_thread()); } - - operator mmdeploy_scheduler_t() const noexcept { return scheduler_.get(); } - - private: - std::shared_ptr scheduler_; -}; - -class Context { - public: - Context() { - mmdeploy_context_t context{}; - mmdeploy_context_create(&context); - context_.reset(context, [](auto p) { mmdeploy_context_destroy(p); }); - } - /* implicit */ Context(const Device& device) : Context() { Add(device); } - - void Add(const std::string& name, const Scheduler& scheduler) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_SCHEDULER, name.c_str(), scheduler); - } - - void Add(const std::string& name, const Model& model) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_MODEL, name.c_str(), model); - } - - void Add(const Device& device) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_DEVICE, nullptr, device); - } - - void Add(const Profiler& profiler) { - mmdeploy_context_add(*this, MMDEPLOY_TYPE_PROFILER, nullptr, profiler); - } - - operator mmdeploy_context_t() const noexcept { return context_.get(); } - - private: - std::shared_ptr context_; -}; - -} // namespace cxx - -using cxx::Context; -using cxx::Device; -using cxx::Mat; -using cxx::Model; -using cxx::Profiler; -using cxx::Rect; -using cxx::Scheduler; + private: + mmdeploy_mat_t desc_; + }; + + template + class Result_ + { + public: + using value_type = T; + using size_type = size_t; + using difference_type = ptrdiff_t; + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + using iterator = T*; + using const_iterator = T*; + + Result_(size_t offset, size_t size, std::shared_ptr data) + : offset_(offset) + , size_(size) + , data_(std::move(data)) + { + } + + T& operator[](size_t index) const noexcept + { + return *(data_.get() + offset_ + index); + } + + size_t size() 
const noexcept + { + return size_; + } + + T* begin() const noexcept + { + return data_.get() + offset_; + } + + T* end() const noexcept + { + return begin() + size_; + } + + T* operator->() const noexcept + { + return data_.get(); + } + + T& operator*() const noexcept + { + return *data_; + } + + private: + size_t offset_; + size_t size_; + std::shared_ptr data_; + }; + + inline const mmdeploy_mat_t* reinterpret(const Mat* p) + { + return reinterpret_cast(p); + } + + class Scheduler + { + public: + explicit Scheduler(mmdeploy_scheduler_t scheduler) + { + scheduler_.reset(scheduler, + [](auto p) + { + mmdeploy_scheduler_destroy(p); + }); + } + + static Scheduler ThreadPool(int num_threads) + { + return Scheduler(mmdeploy_executor_create_thread_pool(num_threads)); + } + + static Scheduler Thread() + { + return Scheduler(mmdeploy_executor_create_thread()); + } + + operator mmdeploy_scheduler_t() const noexcept + { + return scheduler_.get(); + } + + private: + std::shared_ptr scheduler_; + }; + + class Context + { + public: + Context() + { + mmdeploy_context_t context{}; + mmdeploy_context_create(&context); + context_.reset(context, + [](auto p) + { + mmdeploy_context_destroy(p); + }); + } + + /* implicit */ Context(const Device& device) + : Context() + { + Add(device); + } + + void Add(const std::string& name, const Scheduler& scheduler) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_SCHEDULER, name.c_str(), scheduler); + } + + void Add(const std::string& name, const Model& model) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_MODEL, name.c_str(), model); + } + + void Add(const Device& device) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_DEVICE, nullptr, device); + } + + void Add(const Profiler& profiler) + { + mmdeploy_context_add(*this, MMDEPLOY_TYPE_PROFILER, nullptr, profiler); + } + + operator mmdeploy_context_t() const noexcept + { + return context_.get(); + } + + private: + std::shared_ptr context_; + }; + + } // namespace cxx + + using cxx::Context; + using cxx::Device; + using cxx::Mat; + using cxx::Model; + using cxx::Profiler; + using cxx::Rect; + using cxx::Scheduler; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp index 847505bbe7..31874fa9f9 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/detector.hpp @@ -6,68 +6,87 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/detector.h" -namespace mmdeploy { - -namespace cxx { - -using Detection = mmdeploy_detection_t; - -class Detector : public NonMovable { - public: - Detector(const Model& model, const Context& context) { - auto ec = mmdeploy_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~Detector() { - if (detector_) { - mmdeploy_detector_destroy(detector_); - detector_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images) { - if (images.empty()) { - return {}; - } - - Detection* results{}; - int* result_count{}; - auto ec = mmdeploy_detector_apply(detector_, reinterpret(images.data()), - static_cast(images.size()), &results, &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [result_count, count = images.size()](auto p) { - mmdeploy_detector_release_result(p, result_count, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += 
rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image) { return Apply(Span{image})[0]; } - - private: - mmdeploy_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::Detection; -using cxx::Detector; +namespace mmdeploy +{ + + namespace cxx + { + + using Detection = mmdeploy_detection_t; + + class Detector : public NonMovable + { + public: + Detector(const Model& model, const Context& context) + { + auto ec = mmdeploy_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~Detector() + { + if (detector_) + { + mmdeploy_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images) + { + if (images.empty()) + { + return {}; + } + + Detection* results{}; + int* result_count{}; + auto ec = mmdeploy_detector_apply(detector_, + reinterpret(images.data()), + static_cast(images.size()), + &results, + &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, + [result_count, count = images.size()](auto p) + { + mmdeploy_detector_release_result(p, result_count, count); + }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image) + { + return Apply(Span{image})[0]; + } + + private: + mmdeploy_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::Detection; + using cxx::Detector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp index e20ec6a224..9380236f8c 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pipeline.hpp @@ -7,72 +7,91 @@ #include "mmdeploy/core/value.h" #include "mmdeploy/pipeline.h" -namespace mmdeploy { +namespace mmdeploy +{ -namespace cxx { + namespace cxx + { -class Pipeline : public NonMovable { - public: - Pipeline(const Value& config, const Context& context) { - mmdeploy_pipeline_t pipeline{}; - auto ec = mmdeploy_pipeline_create_v3((mmdeploy_value_t)&config, context, &pipeline); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - pipeline_ = pipeline; - } + class Pipeline : public NonMovable + { + public: + Pipeline(const Value& config, const Context& context) + { + mmdeploy_pipeline_t pipeline{}; + auto ec = mmdeploy_pipeline_create_v3((mmdeploy_value_t)&config, + context, + &pipeline); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + pipeline_ = pipeline; + } - ~Pipeline() { - if (pipeline_) { - mmdeploy_pipeline_destroy(pipeline_); - pipeline_ = nullptr; - } - } + ~Pipeline() + { + if (pipeline_) + { + mmdeploy_pipeline_destroy(pipeline_); + pipeline_ = nullptr; + } + } - Value Apply(const Value& inputs) { - mmdeploy_value_t tmp{}; - auto ec = mmdeploy_pipeline_apply(pipeline_, (mmdeploy_value_t)&inputs, &tmp); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - Value output = std::move(*(Value*)tmp); - mmdeploy_value_destroy(tmp); - return output; - } + Value Apply(const Value& inputs) + { + mmdeploy_value_t tmp{}; + auto ec = mmdeploy_pipeline_apply(pipeline_, + (mmdeploy_value_t)&inputs, + &tmp); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + Value output = 
std::move(*(Value*)tmp); + mmdeploy_value_destroy(tmp); + return output; + } - Value Apply(Span images) { - if (images.empty()) { - return {}; - } - mmdeploy_value_t inputs{}; - auto ec = mmdeploy_common_create_input(reinterpret(images.data()), - static_cast(images.size()), &inputs); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - auto outputs = Apply(*reinterpret_cast(inputs)); - mmdeploy_value_destroy(inputs); + Value Apply(Span images) + { + if (images.empty()) + { + return {}; + } + mmdeploy_value_t inputs{}; + auto ec = mmdeploy_common_create_input(reinterpret(images.data()), + static_cast(images.size()), + &inputs); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + auto outputs = Apply(*reinterpret_cast(inputs)); + mmdeploy_value_destroy(inputs); - return outputs; - } + return outputs; + } - Value Apply(const Mat& image) { - auto outputs = Apply(Span{image}); - Value::Array rets; - rets.reserve(outputs.size()); - for (auto& output : outputs) { - rets.push_back(std::move(output[0])); - } - return rets; - } + Value Apply(const Mat& image) + { + auto outputs = Apply(Span{image}); + Value::Array rets; + rets.reserve(outputs.size()); + for (auto& output : outputs) + { + rets.push_back(std::move(output[0])); + } + return rets; + } - private: - mmdeploy_pipeline_t pipeline_{}; -}; + private: + mmdeploy_pipeline_t pipeline_{}; + }; -} // namespace cxx + } // namespace cxx -using cxx::Pipeline; + using cxx::Pipeline; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp index 7432a417fc..34ef2d2221 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_detector.hpp @@ -6,79 +6,91 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/pose_detector.h" -namespace mmdeploy { - -namespace cxx { - -using PoseDetection = mmdeploy_pose_detection_t; - -class PoseDetector : public NonMovable { - public: - PoseDetector(const Model& model, const Context& context) { - auto ec = mmdeploy_pose_detector_create_v2(model, context, &detector_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - - ~PoseDetector() { - if (detector_) { - mmdeploy_pose_detector_destroy(detector_); - detector_ = {}; - } - } - - using Result = Result_; - - std::vector Apply(Span images, Span bboxes, - Span bbox_count) { - if (images.empty()) { - return {}; - } - - const mmdeploy_rect_t* p_bboxes{}; - const int* p_bbox_count{}; - - if (!bboxes.empty()) { - p_bboxes = bboxes.data(); - p_bbox_count = bbox_count.data(); - } - - PoseDetection* results{}; - auto ec = mmdeploy_pose_detector_apply_bbox(detector_, reinterpret(images.data()), - static_cast(images.size()), p_bboxes, - p_bbox_count, &results); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data(results, [count = images.size()](auto p) { - mmdeploy_pose_detector_release_result(p, count); - }); - - std::vector rets; - rets.reserve(images.size()); - - size_t offset = 0; - for (size_t i = 0; i < images.size(); ++i) { - offset += rets.emplace_back(offset, bboxes.empty() ? 
1 : bbox_count[i], data).size(); - } - - return rets; - } - - Result Apply(const Mat& image, Span bboxes = {}) { - return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; - } - - private: - mmdeploy_pose_detector_t detector_{}; -}; - -} // namespace cxx - -using cxx::PoseDetection; -using cxx::PoseDetector; +namespace mmdeploy +{ + + namespace cxx + { + + using PoseDetection = mmdeploy_pose_detection_t; + + class PoseDetector : public NonMovable + { + public: + PoseDetector(const Model& model, const Context& context) + { + auto ec = mmdeploy_pose_detector_create_v2(model, context, &detector_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + + ~PoseDetector() + { + if (detector_) + { + mmdeploy_pose_detector_destroy(detector_); + detector_ = {}; + } + } + + using Result = Result_; + + std::vector Apply(Span images, Span bboxes, Span bbox_count) + { + if (images.empty()) + { + return {}; + } + + const mmdeploy_rect_t* p_bboxes{}; + const int* p_bbox_count{}; + + if (!bboxes.empty()) + { + p_bboxes = bboxes.data(); + p_bbox_count = bbox_count.data(); + } + + PoseDetection* results{}; + auto ec = mmdeploy_pose_detector_apply_bbox(detector_, reinterpret(images.data()), static_cast(images.size()), p_bboxes, p_bbox_count, &results); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::shared_ptr data(results, + [count = images.size()](auto p) + { + mmdeploy_pose_detector_release_result(p, count); + }); + + std::vector rets; + rets.reserve(images.size()); + + size_t offset = 0; + for (size_t i = 0; i < images.size(); ++i) + { + offset += rets.emplace_back(offset, bboxes.empty() ? 1 : bbox_count[i], data).size(); + } + + return rets; + } + + Result Apply(const Mat& image, Span bboxes = {}) + { + return Apply(Span{image}, bboxes, {static_cast(bboxes.size())})[0]; + } + + private: + mmdeploy_pose_detector_t detector_{}; + }; + + } // namespace cxx + + using cxx::PoseDetection; + using cxx::PoseDetector; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp index 077ec75700..e1e330ce05 100644 --- a/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp +++ b/csrc/mmdeploy/apis/cxx/mmdeploy/pose_tracker.hpp @@ -6,145 +6,171 @@ #include "mmdeploy/common.hpp" #include "mmdeploy/pose_tracker.h" -namespace mmdeploy { - -namespace cxx { - -class PoseTracker : public UniqueHandle { - public: - using Result = Result_; - class State; - class Params; - - public: - /** - * @brief Create pose tracker pipeline - * @param detect object detection model - * @param pose pose estimation model - * @param context execution context - */ - PoseTracker(const Model& detect, const Model& pose, const Context& context) { - auto ec = mmdeploy_pose_tracker_create(detect, pose, context, &handle_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - ~PoseTracker() { - if (handle_) { - mmdeploy_pose_tracker_destroy(handle_); - handle_ = {}; - } - } - PoseTracker(PoseTracker&&) noexcept = default; - - /** - * @brief Create a tracker state corresponds to a video stream - * @param params params for creating the tracker state - * @return created tracker state - */ - State CreateState(const Params& params); - - /** - * @brief Apply pose tracker pipeline - * @param state tracker state - * @param frame input video frame - * @param detect control the use of detector - * -1: use params.det_interval, 0: don't use detector, 1: force use detector - * @return - */ - 
Result Apply(State& state, const Mat& frame, int detect = -1); - - /** - * @brief batched version of Apply - * @param states - * @param frames - * @param detects - * @return - */ - std::vector Apply(const Span& states, const Span& frames, - const Span& detects = {}); - - public: - /** - * see \ref mmdeploy/pose_tracker.h for detail - */ - class Params : public UniqueHandle { - public: - explicit Params() { - handle_ = new mmdeploy_pose_tracker_param_t{}; - mmdeploy_pose_tracker_default_params(handle_); - } - ~Params() { - if (handle_) { - delete handle_; - handle_ = {}; - } - } - }; - - class State : public UniqueHandle { - public: - explicit State(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params) { - auto ec = mmdeploy_pose_tracker_create_state(pipeline, params, &handle_); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - } - ~State() { - if (handle_) { - mmdeploy_pose_tracker_destroy_state(handle_); - handle_ = {}; - } - } - State(State&&) noexcept = default; - }; -}; - -inline PoseTracker::State PoseTracker::CreateState(const PoseTracker::Params& params) { - return State(handle_, static_cast(params)); -} - -inline std::vector PoseTracker::Apply(const Span& states, - const Span& frames, - const Span& detects) { - if (frames.empty()) { - return {}; - } - mmdeploy_pose_tracker_target_t* results{}; - int32_t* result_count{}; - - auto ec = mmdeploy_pose_tracker_apply( - handle_, reinterpret_cast(states.data()), - reinterpret(frames.data()), detects.data(), static_cast(frames.size()), &results, - &result_count); - if (ec != MMDEPLOY_SUCCESS) { - throw_exception(static_cast(ec)); - } - - std::shared_ptr data( - results, [result_count, count = frames.size()](auto p) { - mmdeploy_pose_tracker_release_result(p, result_count, count); - }); - - std::vector rets; - rets.reserve(frames.size()); - - size_t offset = 0; - for (size_t i = 0; i < frames.size(); ++i) { - offset += rets.emplace_back(offset, result_count[i], data).size(); - } - - return rets; -} - -inline PoseTracker::Result PoseTracker::Apply(PoseTracker::State& state, const Mat& frame, - int32_t detect) { - return Apply(Span(&state, 1), Span{frame}, Span{detect})[0]; -} - -} // namespace cxx - -using cxx::PoseTracker; +namespace mmdeploy +{ + + namespace cxx + { + + class PoseTracker : public UniqueHandle + { + public: + using Result = Result_; + class State; + class Params; + + public: + /** + * @brief Create pose tracker pipeline + * @param detect object detection model + * @param pose pose estimation model + * @param context execution context + */ + PoseTracker(const Model& detect, const Model& pose, const Context& context) + { + auto ec = mmdeploy_pose_tracker_create(detect, pose, context, &handle_); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + } + ~PoseTracker() + { + if (handle_) + { + mmdeploy_pose_tracker_destroy(handle_); + handle_ = {}; + } + } + PoseTracker(PoseTracker&&) noexcept = default; + + /** + * @brief Create a tracker state corresponds to a video stream + * @param params params for creating the tracker state + * @return created tracker state + */ + State CreateState(const Params& params); + + /** + * @brief Apply pose tracker pipeline + * @param state tracker state + * @param frame input video frame + * @param detect control the use of detector + * -1: use params.det_interval, 0: don't use detector, 1: force use detector + * @return + */ + Result Apply(State& state, const Mat& frame, int detect = -1); + + /** + * @brief batched version 
of Apply
+         * @param states
+         * @param frames
+         * @param detects
+         * @return
+         */
+        std::vector<Result> Apply(const Span<State>& states, const Span<const Mat>& frames, const Span<int32_t>& detects = {});
+
+      public:
+        /**
+         * see \ref mmdeploy/pose_tracker.h for detail
+         */
+        class Params : public UniqueHandle<mmdeploy_pose_tracker_param_t*>
+        {
+          public:
+            explicit Params()
+            {
+                handle_ = new mmdeploy_pose_tracker_param_t{};
+                mmdeploy_pose_tracker_default_params(handle_);
+            }
+            ~Params()
+            {
+                if (handle_)
+                {
+                    delete handle_;
+                    handle_ = {};
+                }
+            }
+        };
+
+        class State : public UniqueHandle<mmdeploy_pose_tracker_state_t>
+        {
+          public:
+            explicit State(mmdeploy_pose_tracker_t pipeline, const mmdeploy_pose_tracker_param_t* params)
+            {
+                auto ec = mmdeploy_pose_tracker_create_state(pipeline, params, &handle_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+            ~State()
+            {
+                if (handle_)
+                {
+                    mmdeploy_pose_tracker_destroy_state(handle_);
+                    handle_ = {};
+                }
+            }
+            State(State&&) noexcept = default;
+        };
+    };
+
+    inline PoseTracker::State PoseTracker::CreateState(const PoseTracker::Params& params)
+    {
+        return State(handle_, static_cast<mmdeploy_pose_tracker_param_t*>(params));
+    }
+
+    inline std::vector<PoseTracker::Result> PoseTracker::Apply(const Span<State>&     states,
+                                                               const Span<const Mat>& frames,
+                                                               const Span<int32_t>&   detects)
+    {
+        if (frames.empty())
+        {
+            return {};
+        }
+        mmdeploy_pose_tracker_target_t* results{};
+        int32_t*                        result_count{};
+
+        auto ec = mmdeploy_pose_tracker_apply(
+            handle_,
+            reinterpret_cast<mmdeploy_pose_tracker_state_t*>(states.data()),
+            reinterpret(frames.data()),
+            detects.data(),
+            static_cast<int32_t>(frames.size()),
+            &results,
+            &result_count);
+        if (ec != MMDEPLOY_SUCCESS)
+        {
+            throw_exception(static_cast<ErrorCode>(ec));
+        }
+
+        std::shared_ptr<mmdeploy_pose_tracker_target_t> data(
+            results,
+            [result_count, count = frames.size()](auto p)
+            {
+                mmdeploy_pose_tracker_release_result(p, result_count, count);
+            });
+
+        std::vector<Result> rets;
+        rets.reserve(frames.size());
+
+        size_t offset = 0;
+        for (size_t i = 0; i < frames.size(); ++i)
+        {
+            offset += rets.emplace_back(offset, result_count[i], data).size();
+        }
+
+        return rets;
+    }
+
+    inline PoseTracker::Result PoseTracker::Apply(PoseTracker::State& state, const Mat& frame, int32_t detect)
+    {
+        return Apply(Span(&state, 1), Span{frame}, Span{detect})[0];
+    }
+
+    }  // namespace cxx
+
+    using cxx::PoseTracker;
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp
index 671c5c2d0c..dcf9ab75af 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/restorer.hpp
@@ -6,62 +6,77 @@
 #include "mmdeploy/common.hpp"
 #include "mmdeploy/restorer.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-class Restorer : public NonMovable {
- public:
-  Restorer(const Model& model, const Context& context) {
-    auto ec = mmdeploy_restorer_create_v2(model, context, &restorer_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~Restorer() {
-    if (restorer_) {
-      mmdeploy_restorer_destroy(restorer_);
-      restorer_ = {};
-    }
-  }
-
-  using Result = Result_<mmdeploy_mat_t>;
-
-  std::vector<Result> Apply(Span<const Mat> images) {
-    if (images.empty()) {
-      return {};
-    }
-
-    mmdeploy_mat_t* results{};
-    auto ec = mmdeploy_restorer_apply(restorer_, reinterpret(images.data()),
-                                      static_cast<int>(images.size()), &results);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    std::shared_ptr<mmdeploy_mat_t> data(
-        results, [count = images.size()](auto p) { mmdeploy_restorer_release_result(p, count); });
-
-    for (size_t i = 0; i < images.size(); ++i) {
-      rets.emplace_back(i, 1, data);
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& image) { return Apply(Span{image})[0]; }
-
- private:
-  mmdeploy_restorer_t restorer_{};
-};
-
-}  // namespace cxx
-
-using cxx::Restorer;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        class Restorer : public NonMovable
+        {
+          public:
+            Restorer(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_restorer_create_v2(model, context, &restorer_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~Restorer()
+            {
+                if (restorer_)
+                {
+                    mmdeploy_restorer_destroy(restorer_);
+                    restorer_ = {};
+                }
+            }
+
+            using Result = Result_<mmdeploy_mat_t>;
+
+            std::vector<Result> Apply(Span<const Mat> images)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                mmdeploy_mat_t* results{};
+                auto ec = mmdeploy_restorer_apply(restorer_, reinterpret(images.data()), static_cast<int>(images.size()), &results);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                std::shared_ptr<mmdeploy_mat_t> data(
+                    results,
+                    [count = images.size()](auto p)
+                    { mmdeploy_restorer_release_result(p, count); });
+
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    rets.emplace_back(i, 1, data);
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& image)
+            {
+                return Apply(Span{image})[0];
+            }
+
+          private:
+            mmdeploy_restorer_t restorer_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::Restorer;
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp
index fa065b0f0c..5a224f6fa5 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/rotated_detector.hpp
@@ -6,69 +6,81 @@
 #include "mmdeploy/common.hpp"
 #include "mmdeploy/rotated_detector.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-using RotatedDetection = mmdeploy_rotated_detection_t;
-
-class RotatedDetector : public NonMovable {
- public:
-  RotatedDetector(const Model& model, const Context& context) {
-    auto ec = mmdeploy_rotated_detector_create_v2(model, context, &detector_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~RotatedDetector() {
-    if (detector_) {
-      mmdeploy_rotated_detector_destroy(detector_);
-      detector_ = {};
-    }
-  }
-
-  using Result = Result_<RotatedDetection>;
-
-  std::vector<Result> Apply(Span<const Mat> images) {
-    if (images.empty()) {
-      return {};
-    }
-
-    RotatedDetection* results{};
-    int* result_count{};
-    auto ec =
-        mmdeploy_rotated_detector_apply(detector_, reinterpret(images.data()),
-                                        static_cast<int>(images.size()), &results, &result_count);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::shared_ptr<RotatedDetection> data(results, [result_count](auto p) {
-      mmdeploy_rotated_detector_release_result(p, result_count);
-    });
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    size_t offset = 0;
-    for (size_t i = 0; i < images.size(); ++i) {
-      offset += rets.emplace_back(offset, result_count[i], data).size();
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& image) { return Apply(Span{image})[0]; }
-
- private:
-  mmdeploy_rotated_detector_t detector_{};
-};
-
-}  // namespace cxx
-
-using cxx::RotatedDetection;
-using cxx::RotatedDetector;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using RotatedDetection = mmdeploy_rotated_detection_t;
+
+        class RotatedDetector : public NonMovable
+        {
+          public:
+            RotatedDetector(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_rotated_detector_create_v2(model, context, &detector_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~RotatedDetector()
+            {
+                if (detector_)
+                {
+                    mmdeploy_rotated_detector_destroy(detector_);
+                    detector_ = {};
+                }
+            }
+
+            using Result = Result_<RotatedDetection>;
+
+            std::vector<Result> Apply(Span<const Mat> images)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                RotatedDetection* results{};
+                int*              result_count{};
+                auto              ec =
+                    mmdeploy_rotated_detector_apply(detector_, reinterpret(images.data()), static_cast<int>(images.size()), &results, &result_count);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::shared_ptr<RotatedDetection> data(results, [result_count](auto p)
+                                                       { mmdeploy_rotated_detector_release_result(p, result_count); });
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                size_t offset = 0;
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    offset += rets.emplace_back(offset, result_count[i], data).size();
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& image)
+            {
+                return Apply(Span{image})[0];
+            }
+
+          private:
+            mmdeploy_rotated_detector_t detector_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::RotatedDetection;
+    using cxx::RotatedDetector;
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp
index fe53023d1c..7ad98a91bb 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/segmentor.hpp
@@ -6,65 +6,80 @@
 #include "mmdeploy/common.hpp"
 #include "mmdeploy/segmentor.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-using Segmentation = mmdeploy_segmentation_t;
-
-class Segmentor : public NonMovable {
- public:
-  Segmentor(const Model& model, const Context& context) {
-    auto ec = mmdeploy_segmentor_create_v2(model, context, &segmentor_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~Segmentor() {
-    if (segmentor_) {
-      mmdeploy_segmentor_destroy(segmentor_);
-      segmentor_ = {};
-    }
-  }
-
-  using Result = Result_<Segmentation>;
-
-  std::vector<Result> Apply(Span<const Mat> images) {
-    if (images.empty()) {
-      return {};
-    }
-
-    Segmentation* results{};
-    auto ec = mmdeploy_segmentor_apply(segmentor_, reinterpret(images.data()),
-                                       static_cast<int>(images.size()), &results);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    std::shared_ptr<Segmentation> data(
-        results, [count = images.size()](auto p) { mmdeploy_segmentor_release_result(p, count); });
-
-    for (size_t i = 0; i < images.size(); ++i) {
-      rets.emplace_back(i, 1, data);
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& image) { return Apply(Span{image})[0]; }
-
- private:
-  mmdeploy_segmentor_t segmentor_{};
-};
-
-}  // namespace cxx
-
-using cxx::Segmentation;
-using cxx::Segmentor;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using Segmentation = mmdeploy_segmentation_t;
+
+        class Segmentor : public NonMovable
+        {
+          public:
+            Segmentor(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_segmentor_create_v2(model, context, &segmentor_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~Segmentor()
+            {
+                if (segmentor_)
+                {
+                    mmdeploy_segmentor_destroy(segmentor_);
+                    segmentor_ = {};
+                }
+            }
+
+            using Result = Result_<Segmentation>;
+
+            std::vector<Result> Apply(Span<const Mat> images)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                Segmentation* results{};
+                auto ec = mmdeploy_segmentor_apply(segmentor_, reinterpret(images.data()), static_cast<int>(images.size()), &results);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                std::shared_ptr<Segmentation> data(
+                    results,
+                    [count = images.size()](auto p)
+                    { mmdeploy_segmentor_release_result(p, count); });
+
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    rets.emplace_back(i, 1, data);
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& image)
+            {
+                return Apply(Span{image})[0];
+            }
+
+          private:
+            mmdeploy_segmentor_t segmentor_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::Segmentation;
+    using cxx::Segmentor;
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp
index d848715405..56f2f02f18 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/text_detector.hpp
@@ -6,69 +6,81 @@
 #include "mmdeploy/common.hpp"
 #include "mmdeploy/text_detector.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-using TextDetection = mmdeploy_text_detection_t;
-
-class TextDetector : public NonMovable {
- public:
-  TextDetector(const Model& model, const Context& context) {
-    auto ec = mmdeploy_text_detector_create_v2(model, context, &detector_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~TextDetector() {
-    if (detector_) {
-      mmdeploy_text_detector_destroy(detector_);
-      detector_ = {};
-    }
-  }
-
-  using Result = Result_<TextDetection>;
-
-  std::vector<Result> Apply(Span<const Mat> images) {
-    if (images.empty()) {
-      return {};
-    }
-
-    TextDetection* results{};
-    int* result_count{};
-    auto ec =
-        mmdeploy_text_detector_apply(detector_, reinterpret(images.data()),
-                                     static_cast<int>(images.size()), &results, &result_count);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::shared_ptr<TextDetection> data(results, [result_count, count = images.size()](auto p) {
-      mmdeploy_text_detector_release_result(p, result_count, count);
-    });
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    size_t offset = 0;
-    for (size_t i = 0; i < images.size(); ++i) {
-      offset += rets.emplace_back(offset, result_count[i], data).size();
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& image) { return Apply(Span{image})[0]; }
-
- private:
-  mmdeploy_text_detector_t detector_{};
-};
-
-}  // namespace cxx
-
-using cxx::TextDetection;
-using cxx::TextDetector;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using TextDetection = mmdeploy_text_detection_t;
+
+        class TextDetector : public NonMovable
+        {
+          public:
+            TextDetector(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_text_detector_create_v2(model, context, &detector_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~TextDetector()
+            {
+                if (detector_)
+                {
+                    mmdeploy_text_detector_destroy(detector_);
+                    detector_ = {};
+                }
+            }
+
+            using Result = Result_<TextDetection>;
+
+            std::vector<Result> Apply(Span<const Mat> images)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                TextDetection* results{};
+                int*           result_count{};
+                auto           ec =
+                    mmdeploy_text_detector_apply(detector_, reinterpret(images.data()), static_cast<int>(images.size()), &results, &result_count);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::shared_ptr<TextDetection> data(results, [result_count, count = images.size()](auto p)
+                                                    { mmdeploy_text_detector_release_result(p, result_count, count); });
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                size_t offset = 0;
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    offset += rets.emplace_back(offset, result_count[i], data).size();
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& image)
+            {
+                return Apply(Span{image})[0];
+            }
+
+          private:
+            mmdeploy_text_detector_t detector_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::TextDetection;
+    using cxx::TextDetector;
 }  // namespace mmdeploy
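The C++ wrappers reformatted above all share one calling convention: construct the wrapper once from a Model and a Context, call Apply on a single Mat or a Span of Mats, and read the output through Result views whose shared_ptr deleter releases the underlying C buffer once the last view goes away. A minimal usage sketch for TextDetector follows; the model path, image file, device name, and the cv::Mat conversion are illustrative assumptions, not something this diff introduces.

// usage sketch only: paths, device name, and OpenCV interop are placeholders
#include "mmdeploy/text_detector.hpp"

#include <opencv2/imgcodecs.hpp>

int main()
{
    using namespace mmdeploy;

    Model        model("/path/to/text-detection-model");  // hypothetical model directory
    Context      context(Device("cpu", 0));               // assumes Device-to-Context construction
    TextDetector detector(model, context);

    cv::Mat img = cv::imread("demo_text_det.jpg");  // hypothetical input image
    auto    result = detector.Apply(img);           // single-image overload shown above

    for (const auto& det : result)
    {
        // each det is an mmdeploy_text_detection_t; the buffer stays alive
        // for as long as any Result copy holds the shared_ptr
    }
}

Because a Result is only a view plus a shared_ptr, copying it or returning it from helpers is cheap and does not duplicate the detection buffer.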
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp
index eba8ea3902..31c741e2ee 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/text_recognizer.hpp
@@ -9,82 +9,91 @@
 #include "mmdeploy/text_detector.hpp"
 #include "mmdeploy/text_recognizer.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-using TextRecognition = mmdeploy_text_recognition_t;
-
-class TextRecognizer : public NonMovable {
- public:
-  TextRecognizer(const Model& model, const Context& context) {
-    auto ec = mmdeploy_text_recognizer_create_v2(model, context, &recognizer_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~TextRecognizer() {
-    if (recognizer_) {
-      mmdeploy_text_recognizer_destroy(recognizer_);
-      recognizer_ = {};
-    }
-  }
-
-  using Result = Result_<TextRecognition>;
-
-  std::vector<Result> Apply(Span<const Mat> images, Span<const TextDetection> bboxes,
-                            Span<const int> bbox_count) {
-    if (images.empty()) {
-      return {};
-    }
-
-    const TextDetection* p_bboxes{};
-    const int* p_bbox_count{};
-
-    auto n_total_bboxes = static_cast<int>(images.size());
-
-    if (!bboxes.empty()) {
-      p_bboxes = bboxes.data();
-      p_bbox_count = bbox_count.data();
-      n_total_bboxes = std::accumulate(bbox_count.begin(), bbox_count.end(), 0);
-    }
-
-    TextRecognition* results{};
-    auto ec = mmdeploy_text_recognizer_apply_bbox(recognizer_, reinterpret(images.data()),
-                                                  static_cast<int>(images.size()), p_bboxes,
-                                                  p_bbox_count, &results);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::shared_ptr<TextRecognition> data(results, [count = n_total_bboxes](auto p) {
-      mmdeploy_text_recognizer_release_result(p, count);
-    });
-
-    std::vector<Result> rets;
-    rets.reserve(images.size());
-
-    size_t offset = 0;
-    for (size_t i = 0; i < images.size(); ++i) {
-      offset += rets.emplace_back(offset, bboxes.empty() ? 1 : bbox_count[i], data).size();
-    }
-
-    return rets;
-  }
-
-  Result Apply(const Mat& image, Span<const TextDetection> bboxes = {}) {
-    return Apply(Span{image}, bboxes, {static_cast<int>(bboxes.size())})[0];
-  }
-
- private:
-  mmdeploy_text_recognizer_t recognizer_{};
-};
-
-}  // namespace cxx
-
-using cxx::TextRecognition;
-using cxx::TextRecognizer;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using TextRecognition = mmdeploy_text_recognition_t;
+
+        class TextRecognizer : public NonMovable
+        {
+          public:
+            TextRecognizer(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_text_recognizer_create_v2(model, context, &recognizer_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~TextRecognizer()
+            {
+                if (recognizer_)
+                {
+                    mmdeploy_text_recognizer_destroy(recognizer_);
+                    recognizer_ = {};
+                }
+            }
+
+            using Result = Result_<TextRecognition>;
+
+            std::vector<Result> Apply(Span<const Mat> images, Span<const TextDetection> bboxes, Span<const int> bbox_count)
+            {
+                if (images.empty())
+                {
+                    return {};
+                }
+
+                const TextDetection* p_bboxes{};
+                const int*           p_bbox_count{};
+
+                auto n_total_bboxes = static_cast<int>(images.size());
+
+                if (!bboxes.empty())
+                {
+                    p_bboxes = bboxes.data();
+                    p_bbox_count = bbox_count.data();
+                    n_total_bboxes = std::accumulate(bbox_count.begin(), bbox_count.end(), 0);
+                }
+
+                TextRecognition* results{};
+                auto ec = mmdeploy_text_recognizer_apply_bbox(recognizer_, reinterpret(images.data()), static_cast<int>(images.size()), p_bboxes, p_bbox_count, &results);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+
+                std::shared_ptr<TextRecognition> data(results, [count = n_total_bboxes](auto p)
+                                                      { mmdeploy_text_recognizer_release_result(p, count); });
+
+                std::vector<Result> rets;
+                rets.reserve(images.size());
+
+                size_t offset = 0;
+                for (size_t i = 0; i < images.size(); ++i)
+                {
+                    offset += rets.emplace_back(offset, bboxes.empty() ? 1 : bbox_count[i], data).size();
+                }
+
+                return rets;
+            }
+
+            Result Apply(const Mat& image, Span<const TextDetection> bboxes = {})
+            {
+                return Apply(Span{image}, bboxes, {static_cast<int>(bboxes.size())})[0];
+            }
+
+          private:
+            mmdeploy_text_recognizer_t recognizer_{};
+        };
+
+    }  // namespace cxx
+
+    using cxx::TextRecognition;
+    using cxx::TextRecognizer;
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp b/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp
index 583b28dd59..ed3569e242 100644
--- a/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp
+++ b/csrc/mmdeploy/apis/cxx/mmdeploy/video_recognizer.hpp
@@ -6,85 +6,97 @@
 #include "mmdeploy/common.hpp"
 #include "mmdeploy/video_recognizer.h"

-namespace mmdeploy {
-
-namespace cxx {
-
-using VideoRecognition = mmdeploy_video_recognition_t;
-using VideoSampleInfo = mmdeploy_video_sample_info_t;
-
-class VideoRecognizer : public NonMovable {
- public:
-  VideoRecognizer(const Model& model, const Context& context) {
-    auto ec = mmdeploy_video_recognizer_create_v2(model, context, &recognizer_);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-  }
-
-  ~VideoRecognizer() {
-    if (recognizer_) {
-      mmdeploy_video_recognizer_destroy(recognizer_);
-      recognizer_ = {};
-    }
-  }
-
-  using Result = Result_<VideoRecognition>;
-
-  std::vector<Result> Apply(Span<Span<const Mat>> videos,
-                            Span<const VideoSampleInfo> infos) {
-    if (videos.empty()) {
-      return {};
-    }
-
-    int video_count = videos.size();
-
-    VideoRecognition* results{};
-    int* result_count{};
-    std::vector<Mat> images;
-    std::vector<VideoSampleInfo> video_info;
-    for (int i = 0; i < videos.size(); i++) {
-      for (auto& mat : videos[i]) {
-        images.push_back(mat);
-      }
-      video_info.push_back(infos[i]);
-    }
-
-    auto ec =
-        mmdeploy_video_recognizer_apply(recognizer_, reinterpret(images.data()), video_info.data(),
-                                        video_count, &results, &result_count);
-    if (ec != MMDEPLOY_SUCCESS) {
-      throw_exception(static_cast<ErrorCode>(ec));
-    }
-
-    std::vector<Result> rets;
-    rets.reserve(video_count);
-
-    std::shared_ptr<VideoRecognition> data(results, [result_count, count = video_count](auto p) {
-      mmdeploy_video_recognizer_release_result(p, result_count, count);
-    });
-
-    size_t offset = 0;
-    for (size_t i = 0; i < video_count; ++i) {
-      offset += rets.emplace_back(offset, result_count[i], data).size();
-    }
-
-    return rets;
-  }
-
-  Result Apply(const std::vector<Mat>& video, const VideoSampleInfo info) {
-    return Apply(Span{video}, Span{info})[0];
-  }
-
- private:
-  mmdeploy_video_recognizer_t recognizer_{};
-};
-
-}  // namespace cxx
-
-using cxx::VideoRecognition;
-using cxx::VideoRecognizer;
-using cxx::VideoSampleInfo;
+namespace mmdeploy
+{
+
+    namespace cxx
+    {
+
+        using VideoRecognition = mmdeploy_video_recognition_t;
+        using VideoSampleInfo = mmdeploy_video_sample_info_t;
+
+        class VideoRecognizer : public NonMovable
+        {
+          public:
+            VideoRecognizer(const Model& model, const Context& context)
+            {
+                auto ec = mmdeploy_video_recognizer_create_v2(model, context, &recognizer_);
+                if (ec != MMDEPLOY_SUCCESS)
+                {
+                    throw_exception(static_cast<ErrorCode>(ec));
+                }
+            }
+
+            ~VideoRecognizer()
+            {
+                if (recognizer_)
+                {
+                    mmdeploy_video_recognizer_destroy(recognizer_);
+                    recognizer_ = {};
+                }
+            }
+
+            using Result = Result_<VideoRecognition>;
+
+            std::vector<Result> Apply(Span<Span<const Mat>> videos,
+                                      Span<const VideoSampleInfo> infos)
+            {
+                if (videos.empty())
+                {
+                    return {};
+                }
+
+                int video_count = videos.size();
+
+                VideoRecognition*            results{};
+                int*                         result_count{};
+                std::vector<Mat>             images;
+                std::vector<VideoSampleInfo> video_info;
+                for (int i = 0; i < videos.size(); i++)
+                {
+                    for (auto& mat : videos[i])
+                    {
+                        images.push_back(mat);
+                    }
+                    video_info.push_back(infos[i]);
+                }
+
auto ec = + mmdeploy_video_recognizer_apply(recognizer_, reinterpret(images.data()), video_info.data(), video_count, &results, &result_count); + if (ec != MMDEPLOY_SUCCESS) + { + throw_exception(static_cast(ec)); + } + + std::vector rets; + rets.reserve(video_count); + + std::shared_ptr data(results, [result_count, count = video_count](auto p) + { mmdeploy_video_recognizer_release_result(p, result_count, count); }); + + size_t offset = 0; + for (size_t i = 0; i < video_count; ++i) + { + offset += rets.emplace_back(offset, result_count[i], data).size(); + } + + return rets; + } + + Result Apply(const std::vector& video, const VideoSampleInfo info) + { + return Apply(Span{video}, Span{info})[0]; + } + + private: + mmdeploy_video_recognizer_t recognizer_{}; + }; + + } // namespace cxx + + using cxx::VideoRecognition; + using cxx::VideoRecognizer; + using cxx::VideoSampleInfo; } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/java/CMakeLists.txt b/csrc/mmdeploy/apis/java/CMakeLists.txt index 04313f1934..6ae7a8e0ad 100644 --- a/csrc/mmdeploy/apis/java/CMakeLists.txt +++ b/csrc/mmdeploy/apis/java/CMakeLists.txt @@ -1,6 +1,6 @@ -if (NOT MMDEPLOY_BUILD_SDK_JAVA_API) - return () -endif () +if(NOT MMDEPLOY_BUILD_SDK_JAVA_API) + return() +endif() project(mmdeploy_java_package) @@ -9,26 +9,27 @@ include(UseJava) add_subdirectory(native) -add_jar(${PROJECT_NAME} SOURCES - mmdeploy/DataType.java - mmdeploy/Mat.java - mmdeploy/InstanceMask.java - mmdeploy/PixelFormat.java - mmdeploy/PointF.java - mmdeploy/Rect.java - mmdeploy/Classifier.java - mmdeploy/Detector.java - mmdeploy/Segmentor.java - mmdeploy/TextDetector.java - mmdeploy/TextRecognizer.java - mmdeploy/Restorer.java - mmdeploy/PoseDetector.java - mmdeploy/Context.java - mmdeploy/Device.java - mmdeploy/Model.java - mmdeploy/Profiler.java - mmdeploy/Scheduler.java - mmdeploy/PoseTracker.java - mmdeploy/RotatedDetector.java - OUTPUT_NAME mmdeploy - OUTPUT_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) +add_jar( + ${PROJECT_NAME} + SOURCES mmdeploy/DataType.java + mmdeploy/Mat.java + mmdeploy/InstanceMask.java + mmdeploy/PixelFormat.java + mmdeploy/PointF.java + mmdeploy/Rect.java + mmdeploy/Classifier.java + mmdeploy/Detector.java + mmdeploy/Segmentor.java + mmdeploy/TextDetector.java + mmdeploy/TextRecognizer.java + mmdeploy/Restorer.java + mmdeploy/PoseDetector.java + mmdeploy/Context.java + mmdeploy/Device.java + mmdeploy/Model.java + mmdeploy/Profiler.java + mmdeploy/Scheduler.java + mmdeploy/PoseTracker.java + mmdeploy/RotatedDetector.java + OUTPUT_NAME mmdeploy + OUTPUT_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) diff --git a/csrc/mmdeploy/apis/java/native/CMakeLists.txt b/csrc/mmdeploy/apis/java/native/CMakeLists.txt index 6324cd21a1..b1868b8567 100644 --- a/csrc/mmdeploy/apis/java/native/CMakeLists.txt +++ b/csrc/mmdeploy/apis/java/native/CMakeLists.txt @@ -1,35 +1,35 @@ # Copyright (c) OpenMMLab. All rights reserved. 
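The native library assembled by this CMake file backs the Java classes listed in add_jar above with thin JNI shims. One pattern repeats through every shim in this diff: each Get call that borrows data from the JVM (GetStringUTFChars, Get*ArrayElements) is paired with a matching Release call once the C API has consumed the data. In isolation the pattern looks like the sketch below; the binding name is hypothetical and only jni.h is assumed.

#include <jni.h>

// sketch of the Get/Release pairing used by the bindings that follow;
// Java_mmdeploy_Example_create is a made-up binding, not part of this diff
extern "C" JNIEXPORT jlong JNICALL Java_mmdeploy_Example_create(JNIEnv* env, jobject, jstring path)
{
    const char* utf = env->GetStringUTFChars(path, nullptr);  // borrow the string bytes
    jlong       handle = 0;
    // ... create the native object from 'utf' and store its address in 'handle' ...
    env->ReleaseStringUTFChars(path, utf);  // give the bytes back before returning
    return handle;  // the Java side keeps the native pointer as a long
}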
project(mmdeploy_java) -if (NOT ANDROID) - find_package(JNI REQUIRED) -else () - set(JNI_LIBRARIES) +if(NOT ANDROID) + find_package(JNI REQUIRED) +else() + set(JNI_LIBRARIES) endif() -mmdeploy_add_library(${PROJECT_NAME} SHARED EXCLUDE - mmdeploy_Classifier.cpp - mmdeploy_Detector.cpp - mmdeploy_Segmentor.cpp - mmdeploy_Restorer.cpp - mmdeploy_PoseDetector.cpp - mmdeploy_TextDetector.cpp - mmdeploy_TextRecognizer.cpp - mmdeploy_PoseTracker.cpp - mmdeploy_Context.cpp - mmdeploy_Device.cpp - mmdeploy_Model.cpp - mmdeploy_Profiler.cpp - mmdeploy_Scheduler.cpp - mmdeploy_RotatedDetector.cpp) +mmdeploy_add_library( + ${PROJECT_NAME} + SHARED + EXCLUDE + mmdeploy_Classifier.cpp + mmdeploy_Detector.cpp + mmdeploy_Segmentor.cpp + mmdeploy_Restorer.cpp + mmdeploy_PoseDetector.cpp + mmdeploy_TextDetector.cpp + mmdeploy_TextRecognizer.cpp + mmdeploy_PoseTracker.cpp + mmdeploy_Context.cpp + mmdeploy_Device.cpp + mmdeploy_Model.cpp + mmdeploy_Profiler.cpp + mmdeploy_Scheduler.cpp + mmdeploy_RotatedDetector.cpp) -target_include_directories(${PROJECT_NAME} PRIVATE - ${JNI_INCLUDE_DIRS}) +target_include_directories(${PROJECT_NAME} PRIVATE ${JNI_INCLUDE_DIRS}) mmdeploy_load_static(${PROJECT_NAME} MMDeployStaticModules) mmdeploy_load_dynamic(${PROJECT_NAME} MMDeployDynamicModules) -target_link_libraries(${PROJECT_NAME} PRIVATE - ${JNI_LIBRARIES} MMDeployLibs) -install(TARGETS ${PROJECT_NAME} - DESTINATION lib) +target_link_libraries(${PROJECT_NAME} PRIVATE ${JNI_LIBRARIES} MMDeployLibs) +install(TARGETS ${PROJECT_NAME} DESTINATION lib) diff --git a/csrc/mmdeploy/apis/java/native/common.h b/csrc/mmdeploy/apis/java/native/common.h index ba2601e5f1..045dc02a35 100644 --- a/csrc/mmdeploy/apis/java/native/common.h +++ b/csrc/mmdeploy/apis/java/native/common.h @@ -10,45 +10,48 @@ #include "mmdeploy/core/logger.h" #include "mmdeploy/core/utils/formatter.h" -template -static auto With(JNIEnv *env, jobjectArray imgs, F f) noexcept { - auto mat_clazz = env->FindClass("mmdeploy/Mat"); - auto shape_field = env->GetFieldID(mat_clazz, "shape", "[I"); - auto format_field = env->GetFieldID(mat_clazz, "format", "I"); - auto type_field = env->GetFieldID(mat_clazz, "type", "I"); - auto data_field = env->GetFieldID(mat_clazz, "data", "[B"); - auto num = env->GetArrayLength(imgs); - std::vector mats; - std::vector datum; - - mats.reserve(num); - datum.reserve(num); - - for (int i = 0; i < num; ++i) { - auto obj = env->GetObjectArrayElement(imgs, i); - auto shape_obj = env->GetObjectField(obj, shape_field); - auto shape = env->GetIntArrayElements((jintArray)shape_obj, nullptr); - auto format = env->GetIntField(obj, format_field); - auto type = env->GetIntField(obj, type_field); - auto &mat = mats.emplace_back(); - mat.height = shape[0]; - mat.width = shape[1]; - mat.channel = shape[2]; - env->ReleaseIntArrayElements((jintArray)shape_obj, shape, JNI_ABORT); - mat.format = (mmdeploy_pixel_format_t)format; - mat.type = (mmdeploy_data_type_t)type; - auto data_obj = env->GetObjectField(obj, data_field); - mat.data = (uint8_t *)env->GetByteArrayElements((jbyteArray)data_obj, nullptr); - datum.push_back((jbyteArray)data_obj); - } - - auto ret = f(mats.data(), mats.size()); // ! 
f must not throw - - for (int i = 0; i < num; ++i) { - env->ReleaseByteArrayElements(datum[i], (jbyte *)mats[i].data, JNI_ABORT); - } - - return ret; +template +static auto With(JNIEnv* env, jobjectArray imgs, F f) noexcept +{ + auto mat_clazz = env->FindClass("mmdeploy/Mat"); + auto shape_field = env->GetFieldID(mat_clazz, "shape", "[I"); + auto format_field = env->GetFieldID(mat_clazz, "format", "I"); + auto type_field = env->GetFieldID(mat_clazz, "type", "I"); + auto data_field = env->GetFieldID(mat_clazz, "data", "[B"); + auto num = env->GetArrayLength(imgs); + std::vector mats; + std::vector datum; + + mats.reserve(num); + datum.reserve(num); + + for (int i = 0; i < num; ++i) + { + auto obj = env->GetObjectArrayElement(imgs, i); + auto shape_obj = env->GetObjectField(obj, shape_field); + auto shape = env->GetIntArrayElements((jintArray)shape_obj, nullptr); + auto format = env->GetIntField(obj, format_field); + auto type = env->GetIntField(obj, type_field); + auto& mat = mats.emplace_back(); + mat.height = shape[0]; + mat.width = shape[1]; + mat.channel = shape[2]; + env->ReleaseIntArrayElements((jintArray)shape_obj, shape, JNI_ABORT); + mat.format = (mmdeploy_pixel_format_t)format; + mat.type = (mmdeploy_data_type_t)type; + auto data_obj = env->GetObjectField(obj, data_field); + mat.data = (uint8_t*)env->GetByteArrayElements((jbyteArray)data_obj, nullptr); + datum.push_back((jbyteArray)data_obj); + } + + auto ret = f(mats.data(), mats.size()); // ! f must not throw + + for (int i = 0; i < num; ++i) + { + env->ReleaseByteArrayElements(datum[i], (jbyte*)mats[i].data, JNI_ABORT); + } + + return ret; } #endif // MMDEPLOY_CSRC_APIS_JAVA_NATIVE_COMMON_H_ diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp index 2a3309361e..6664a65289 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.cpp @@ -6,30 +6,33 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Classifier_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_classifier_t classifier{}; - auto ec = - mmdeploy_classifier_create_by_path(model_path, device_name, (int)device_id, &classifier); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create classifier, code = {}", ec); - return -1; - } - return (jlong)classifier; +jlong Java_mmdeploy_Classifier_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_classifier_t classifier{}; + auto ec = + mmdeploy_classifier_create_by_path(model_path, device_name, (int)device_id, &classifier); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create classifier, code = {}", ec); + return -1; + } + return (jlong)classifier; } -void Java_mmdeploy_Classifier_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Classifier_destroy"); - mmdeploy_classifier_destroy((mmdeploy_classifier_t)handle); +void Java_mmdeploy_Classifier_destroy(JNIEnv*, jobject, 
jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Classifier_destroy"); + mmdeploy_classifier_destroy((mmdeploy_classifier_t)handle); } -jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_classification_t *results{}; int *result_count{}; auto ec = mmdeploy_classifier_apply((mmdeploy_classifier_t)handle, imgs, size, &results, @@ -55,6 +58,5 @@ jobjectArray Java_mmdeploy_Classifier_apply(JNIEnv *env, jobject thiz, jlong han } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_classifier_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h index 16a06b5fba..84adf58aa3 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Classifier.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Classifier */ #ifndef _Included_mmdeploy_Classifier -#define _Included_mmdeploy_Classifier -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Classifier - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Classifier_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Classifier + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Classifier + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Classifier_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Classifier - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Classifier_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Classifier + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Classifier_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Classifier - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Classifier/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Classifier_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: mmdeploy_Classifier + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Classifier/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Classifier_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp index dbd401724e..e875a66ead 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.cpp @@ -8,36 +8,43 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Context_create(JNIEnv *env, jobject) { - mmdeploy_context_t context{}; - mmdeploy_context_create(&context); - return (jlong)context; +jlong Java_mmdeploy_Context_create(JNIEnv* env, jobject) +{ + mmdeploy_context_t context{}; + mmdeploy_context_create(&context); + return (jlong)context; } -jint Java_mmdeploy_Context_add(JNIEnv 
*env, jobject, jlong context_, jint contextType, jstring name, - jlong handle) { - auto object_name = env->GetStringUTFChars(name, nullptr); - if ((int)contextType == MMDEPLOY_TYPE_SCHEDULER) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - object_name, (mmdeploy_scheduler_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_MODEL) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - object_name, (mmdeploy_model_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_DEVICE) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - nullptr, (mmdeploy_device_t)handle); - } else if ((int)contextType == MMDEPLOY_TYPE_PROFILER) { - mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, - nullptr, (mmdeploy_profiler_t)handle); - } else { - MMDEPLOY_ERROR("wrong context type, got {}", (int)contextType); - return MMDEPLOY_E_NOT_SUPPORTED; - } - env->ReleaseStringUTFChars(name, object_name); - return 0; +jint Java_mmdeploy_Context_add(JNIEnv* env, jobject, jlong context_, jint contextType, jstring name, jlong handle) +{ + auto object_name = env->GetStringUTFChars(name, nullptr); + if ((int)contextType == MMDEPLOY_TYPE_SCHEDULER) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, object_name, (mmdeploy_scheduler_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_MODEL) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, object_name, (mmdeploy_model_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_DEVICE) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, nullptr, (mmdeploy_device_t)handle); + } + else if ((int)contextType == MMDEPLOY_TYPE_PROFILER) + { + mmdeploy_context_add((mmdeploy_context_t)context_, (mmdeploy_context_type_t)contextType, nullptr, (mmdeploy_profiler_t)handle); + } + else + { + MMDEPLOY_ERROR("wrong context type, got {}", (int)contextType); + return MMDEPLOY_E_NOT_SUPPORTED; + } + env->ReleaseStringUTFChars(name, object_name); + return 0; } -void Java_mmdeploy_Context_destroy(JNIEnv *, jobject, jlong context_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Context_destroy"); - mmdeploy_context_destroy((mmdeploy_context_t)context_); +void Java_mmdeploy_Context_destroy(JNIEnv*, jobject, jlong context_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Context_destroy"); + mmdeploy_context_destroy((mmdeploy_context_t)context_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h index 42df819580..00e24065c6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Context.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Context */ #ifndef _Included_mmdeploy_Context -#define _Included_mmdeploy_Context -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Context - * Method: create - * Signature: ()J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Context_create(JNIEnv *, jobject); + #define _Included_mmdeploy_Context + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Context + * Method: create + * Signature: ()J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Context_create(JNIEnv*, jobject); -/* - * Class: mmdeploy_Context - * Method: add - * Signature: (JILjava/lang/String;J)I - */ -JNIEXPORT jint JNICALL Java_mmdeploy_Context_add(JNIEnv *, jobject, jlong, 
jint, jstring, jlong); + /* + * Class: mmdeploy_Context + * Method: add + * Signature: (JILjava/lang/String;J)I + */ + JNIEXPORT jint JNICALL Java_mmdeploy_Context_add(JNIEnv*, jobject, jlong, jint, jstring, jlong); -/* - * Class: mmdeploy_Context - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Context_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Context + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Context_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp index c03ff1a1ff..6e8a32dac7 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Detector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_detector_t detector{}; - auto ec = mmdeploy_detector_create_by_path(model_path, device_name, (int)device_id, &detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create detector, code = {}", ec); - return -1; - } - return (jlong)detector; +jlong Java_mmdeploy_Detector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_detector_t detector{}; + auto ec = mmdeploy_detector_create_by_path(model_path, device_name, (int)device_id, &detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create detector, code = {}", ec); + return -1; + } + return (jlong)detector; } -void Java_mmdeploy_Detector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Detector_destroy"); // maybe use info? - mmdeploy_detector_destroy((mmdeploy_detector_t)handle); +void Java_mmdeploy_Detector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Detector_destroy"); // maybe use info? 
+ mmdeploy_detector_destroy((mmdeploy_detector_t)handle); } -jobjectArray Java_mmdeploy_Detector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Detector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_detection_t *results{}; int *result_count{}; auto ec = @@ -79,6 +82,5 @@ jobjectArray Java_mmdeploy_Detector_apply(JNIEnv *env, jobject thiz, jlong handl } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_detector_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h index 41e711d15a..578643efc8 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Detector.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Detector */ #ifndef _Included_mmdeploy_Detector -#define _Included_mmdeploy_Detector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Detector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Detector_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Detector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Detector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Detector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Detector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Detector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Detector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Detector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Detector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Detector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Detector_apply(JNIEnv *, jobject, jlong, jobjectArray, - jintArray); + /* + * Class: mmdeploy_Detector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/Detector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Detector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp index 8dbec9285b..8160210ed5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Device_create(JNIEnv *env, jobject, jstring name, jint index) { - auto device_name = env->GetStringUTFChars(name, nullptr); - mmdeploy_device_t device{}; - auto ec = mmdeploy_device_create(device_name, (int)index, &device); - env->ReleaseStringUTFChars(name, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create device, code = {}", ec); - return -1; - } - return (jlong)device; +jlong Java_mmdeploy_Device_create(JNIEnv* env, jobject, jstring name, jint index) +{ + auto device_name = env->GetStringUTFChars(name, nullptr); + 
mmdeploy_device_t device{}; + auto ec = mmdeploy_device_create(device_name, (int)index, &device); + env->ReleaseStringUTFChars(name, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create device, code = {}", ec); + return -1; + } + return (jlong)device; } -void Java_mmdeploy_Device_destroy(JNIEnv *, jobject, jlong device_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Device_destroy"); - mmdeploy_device_destroy((mmdeploy_device_t)device_); +void Java_mmdeploy_Device_destroy(JNIEnv*, jobject, jlong device_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Device_destroy"); + mmdeploy_device_destroy((mmdeploy_device_t)device_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h index 7d7ee9dee7..e751d0f781 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Device.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Device */ #ifndef _Included_mmdeploy_Device -#define _Included_mmdeploy_Device -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Device - * Method: create - * Signature: (Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Device_create(JNIEnv *, jobject, jstring, jint); + #define _Included_mmdeploy_Device + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Device + * Method: create + * Signature: (Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Device_create(JNIEnv*, jobject, jstring, jint); -/* - * Class: mmdeploy_Device - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Device_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Device + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Device_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp index 2bbc9a6920..821b1e988e 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Model_create(JNIEnv *env, jobject, jstring path) { - auto model_path = env->GetStringUTFChars(path, nullptr); - mmdeploy_model_t model{}; - auto ec = mmdeploy_model_create_by_path(model_path, &model); - env->ReleaseStringUTFChars(path, model_path); - if (ec) { - MMDEPLOY_ERROR("failed to create model, code = {}", ec); - return -1; - } - return (jlong)model; +jlong Java_mmdeploy_Model_create(JNIEnv* env, jobject, jstring path) +{ + auto model_path = env->GetStringUTFChars(path, nullptr); + mmdeploy_model_t model{}; + auto ec = mmdeploy_model_create_by_path(model_path, &model); + env->ReleaseStringUTFChars(path, model_path); + if (ec) + { + MMDEPLOY_ERROR("failed to create model, code = {}", ec); + return -1; + } + return (jlong)model; } -void Java_mmdeploy_Model_destroy(JNIEnv *, jobject, jlong model_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Model_destroy"); - mmdeploy_model_destroy((mmdeploy_model_t)model_); +void Java_mmdeploy_Model_destroy(JNIEnv*, jobject, jlong model_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Model_destroy"); + mmdeploy_model_destroy((mmdeploy_model_t)model_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h index 11e23a1a81..9fc714c259 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h +++ 
b/csrc/mmdeploy/apis/java/native/mmdeploy_Model.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Model */ #ifndef _Included_mmdeploy_Model -#define _Included_mmdeploy_Model -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Model - * Method: create - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Model_create(JNIEnv *, jobject, jstring); + #define _Included_mmdeploy_Model + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Model + * Method: create + * Signature: (Ljava/lang/String;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Model_create(JNIEnv*, jobject, jstring); -/* - * Class: mmdeploy_Model - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Model_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Model + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Model_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp index 4956555a6e..aac54574a0 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_PoseDetector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_pose_detector_t pose_estimator{}; - auto ec = mmdeploy_pose_detector_create_by_path(model_path, device_name, (int)device_id, - &pose_estimator); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create pose estimator, code = {}", ec); - return -1; - } - return (jlong)pose_estimator; +jlong Java_mmdeploy_PoseDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_pose_detector_t pose_estimator{}; + auto ec = mmdeploy_pose_detector_create_by_path(model_path, device_name, (int)device_id, &pose_estimator); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose estimator, code = {}", ec); + return -1; + } + return (jlong)pose_estimator; } -void Java_mmdeploy_PoseDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_PoseDetector_destroy"); - mmdeploy_pose_detector_destroy((mmdeploy_pose_detector_t)handle); +void Java_mmdeploy_PoseDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseDetector_destroy"); + mmdeploy_pose_detector_destroy((mmdeploy_pose_detector_t)handle); } -jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_pose_detection_t *results{}; auto ec = 
mmdeploy_pose_detector_apply((mmdeploy_pose_detector_t)handle, imgs, size, &results); if (ec) { @@ -55,6 +57,5 @@ jobjectArray Java_mmdeploy_PoseDetector_apply(JNIEnv *env, jobject thiz, jlong h env->SetObjectArrayElement(array, i, res); } mmdeploy_pose_detector_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h index a50b7fd821..87c70ac0a6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_PoseDetector */ #ifndef _Included_mmdeploy_PoseDetector -#define _Included_mmdeploy_PoseDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_PoseDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_PoseDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_PoseDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_PoseDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/PoseDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_PoseDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/PoseDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseDetector_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp index c0d1685729..61fd42eb07 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.cpp @@ -6,143 +6,161 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_PoseTracker_create(JNIEnv *env, jobject, jlong detModel, jlong poseModel, - jlong context) { - mmdeploy_pose_tracker_t pose_tracker{}; - auto ec = mmdeploy_pose_tracker_create((mmdeploy_model_t)detModel, (mmdeploy_model_t)poseModel, - (mmdeploy_context_t)context, &pose_tracker); - if (ec) { - MMDEPLOY_ERROR("failed to create pose tracker, code = {}", ec); - return -1; - } - return (jlong)pose_tracker; +jlong Java_mmdeploy_PoseTracker_create(JNIEnv* env, jobject, jlong detModel, jlong poseModel, jlong context) +{ + mmdeploy_pose_tracker_t pose_tracker{}; + auto ec = mmdeploy_pose_tracker_create((mmdeploy_model_t)detModel, (mmdeploy_model_t)poseModel, (mmdeploy_context_t)context, &pose_tracker); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose tracker, code = {}", ec); + return -1; + } + return (jlong)pose_tracker; } -void Java_mmdeploy_PoseTracker_destroy(JNIEnv *, jobject, jlong handle) { - 
MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); - mmdeploy_pose_tracker_destroy((mmdeploy_pose_tracker_t)handle); +void Java_mmdeploy_PoseTracker_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); + mmdeploy_pose_tracker_destroy((mmdeploy_pose_tracker_t)handle); } -jobject param_cpp_to_java(JNIEnv *env, mmdeploy_pose_tracker_param_t *params) { - auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); - auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); +jobject param_cpp_to_java(JNIEnv* env, mmdeploy_pose_tracker_param_t* params) +{ + auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); + auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); - jfloatArray keypointSigmas = env->NewFloatArray(params->keypoint_sigmas_size); - env->SetFloatArrayRegion(keypointSigmas, 0, params->keypoint_sigmas_size, - (jfloat *)params->keypoint_sigmas); - jfloatArray smoothParams = env->NewFloatArray(3); - env->SetFloatArrayRegion(smoothParams, 0, 3, (jfloat *)params->smooth_params); + jfloatArray keypointSigmas = env->NewFloatArray(params->keypoint_sigmas_size); + env->SetFloatArrayRegion(keypointSigmas, 0, params->keypoint_sigmas_size, (jfloat*)params->keypoint_sigmas); + jfloatArray smoothParams = env->NewFloatArray(3); + env->SetFloatArrayRegion(smoothParams, 0, 3, (jfloat*)params->smooth_params); - auto param = env->NewObject( - param_cls, param_ctor, (jint)params->det_interval, (jint)params->det_label, - (jfloat)params->det_thr, (jfloat)params->det_min_bbox_size, (jfloat)params->det_nms_thr, - (jint)params->pose_max_num_bboxes, (jfloat)params->pose_kpt_thr, - (jint)params->pose_min_keypoints, (jfloat)params->pose_bbox_scale, - (jfloat)params->pose_min_bbox_size, (jfloat)params->pose_nms_thr, keypointSigmas, - (jint)params->keypoint_sigmas_size, (jfloat)params->track_iou_thr, - (jint)params->track_max_missing, (jint)params->track_history_size, - (jfloat)params->std_weight_position, (jfloat)params->std_weight_velocity, smoothParams); - return param; + auto param = env->NewObject( + param_cls, + param_ctor, + (jint)params->det_interval, + (jint)params->det_label, + (jfloat)params->det_thr, + (jfloat)params->det_min_bbox_size, + (jfloat)params->det_nms_thr, + (jint)params->pose_max_num_bboxes, + (jfloat)params->pose_kpt_thr, + (jint)params->pose_min_keypoints, + (jfloat)params->pose_bbox_scale, + (jfloat)params->pose_min_bbox_size, + (jfloat)params->pose_nms_thr, + keypointSigmas, + (jint)params->keypoint_sigmas_size, + (jfloat)params->track_iou_thr, + (jint)params->track_max_missing, + (jint)params->track_history_size, + (jfloat)params->std_weight_position, + (jfloat)params->std_weight_velocity, + smoothParams); + return param; } -void param_java_to_cpp(JNIEnv *env, mmdeploy_pose_tracker_param_t *params, jobject customParam) { - auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); - auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); +void param_java_to_cpp(JNIEnv* env, mmdeploy_pose_tracker_param_t* params, jobject customParam) +{ + auto param_cls = env->FindClass("mmdeploy/PoseTracker$Params"); + auto param_ctor = env->GetMethodID(param_cls, "", "(IIFFFIFIFFF[FIFIIFF[F)V"); - jfieldID fieldID_detInterval = env->GetFieldID(param_cls, "detInterval", "I"); - jint detInterval = env->GetIntField(customParam, fieldID_detInterval); - params->det_interval = (int)detInterval; - jfieldID fieldID_detLabel = env->GetFieldID(param_cls, "detLabel", "I"); - 
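Every field that param_java_to_cpp copies, above and below this point, follows the same three steps: look up the jfieldID, read the typed field from the Java object, and assign it to the C struct. A pair of helpers like the following (hypothetical, shown only to make the repeated shape explicit; the diff itself keeps the long-hand form) would condense each two-line block:

#include <jni.h>

// hypothetical condensation of the GetFieldID / Get<Type>Field pattern
static jint GetIntMember(JNIEnv* env, jclass cls, jobject obj, const char* name)
{
    return env->GetIntField(obj, env->GetFieldID(cls, name, "I"));
}

static jfloat GetFloatMember(JNIEnv* env, jclass cls, jobject obj, const char* name)
{
    return env->GetFloatField(obj, env->GetFieldID(cls, name, "F"));
}

// e.g. params->det_interval = GetIntMember(env, param_cls, customParam, "detInterval");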
jint detLabel = env->GetIntField(customParam, fieldID_detLabel); - params->det_label = (int)detLabel; - jfieldID fieldID_detThr = env->GetFieldID(param_cls, "detThr", "F"); - jfloat detThr = env->GetFloatField(customParam, fieldID_detThr); - params->det_thr = (float)detThr; - jfieldID fieldID_detMinBboxSize = env->GetFieldID(param_cls, "detMinBboxSize", "F"); - jfloat detMinBboxSize = env->GetFloatField(customParam, fieldID_detMinBboxSize); - params->det_min_bbox_size = (float)detMinBboxSize; - jfieldID fieldID_detNmsThr = env->GetFieldID(param_cls, "detNmsThr", "F"); - jfloat detNmsThr = env->GetFloatField(customParam, fieldID_detNmsThr); - params->det_nms_thr = (float)detNmsThr; - jfieldID fieldID_poseMaxNumBboxes = env->GetFieldID(param_cls, "poseMaxNumBboxes", "I"); - jint poseMaxNumBboxes = env->GetIntField(customParam, fieldID_poseMaxNumBboxes); - params->pose_max_num_bboxes = (int)poseMaxNumBboxes; - jfieldID fieldID_poseKptThr = env->GetFieldID(param_cls, "poseKptThr", "F"); - jfloat poseKptThr = env->GetFloatField(customParam, fieldID_poseKptThr); - params->pose_kpt_thr = (float)poseKptThr; - jfieldID fieldID_poseMinKeypoints = env->GetFieldID(param_cls, "poseMinKeypoints", "I"); - jint poseMinKeypoints = env->GetIntField(customParam, fieldID_poseMinKeypoints); - params->pose_min_keypoints = (int)poseMinKeypoints; - jfieldID fieldID_poseBboxScale = env->GetFieldID(param_cls, "poseBboxScale", "F"); - jfloat poseBboxScale = env->GetFloatField(customParam, fieldID_poseBboxScale); - params->pose_bbox_scale = (float)poseBboxScale; - jfieldID fieldID_poseMinBboxSize = env->GetFieldID(param_cls, "poseMinBboxSize", "F"); - jfloat poseMinBboxSize = env->GetFloatField(customParam, fieldID_poseMinBboxSize); - params->pose_min_bbox_size = (float)poseMinBboxSize; - jfieldID fieldID_poseNmsThr = env->GetFieldID(param_cls, "poseNmsThr", "F"); - jfloat poseNmsThr = env->GetFloatField(customParam, fieldID_poseNmsThr); - params->pose_nms_thr = (float)poseNmsThr; - jfieldID fieldID_keypointSigmas = env->GetFieldID(param_cls, "keypointSigmas", "[F"); - auto keypointSigmasObj = env->GetObjectField(customParam, fieldID_keypointSigmas); - float *keypointSigmas = - (float *)env->GetFloatArrayElements((jfloatArray)keypointSigmasObj, nullptr); - params->keypoint_sigmas = keypointSigmas; - env->ReleaseFloatArrayElements((jfloatArray)keypointSigmasObj, keypointSigmas, JNI_ABORT); - jfieldID fieldID_keypointSigmasSize = env->GetFieldID(param_cls, "keypointSigmasSize", "I"); - jint keypointSigmasSize = env->GetIntField(customParam, fieldID_keypointSigmasSize); - params->keypoint_sigmas_size = keypointSigmasSize; - jfieldID fieldID_trackIouThr = env->GetFieldID(param_cls, "trackIouThr", "F"); - jfloat trackIouThr = env->GetFloatField(customParam, fieldID_trackIouThr); - params->track_iou_thr = trackIouThr; - jfieldID fieldID_trackMaxMissing = env->GetFieldID(param_cls, "trackMaxMissing", "I"); - jint trackMaxMissing = env->GetIntField(customParam, fieldID_trackMaxMissing); - params->track_max_missing = trackMaxMissing; - jfieldID fieldID_trackHistorySize = env->GetFieldID(param_cls, "trackHistorySize", "I"); - jint trackHistorySize = env->GetIntField(customParam, fieldID_trackHistorySize); - params->track_history_size = trackHistorySize; - jfieldID fieldID_stdWeightPosition = env->GetFieldID(param_cls, "stdWeightPosition", "F"); - jfloat stdWeightPosition = env->GetFloatField(customParam, fieldID_stdWeightPosition); - params->std_weight_position = stdWeightPosition; - jfieldID fieldID_stdWeightVelocity = 
env->GetFieldID(param_cls, "stdWeightVelocity", "F"); - jfloat stdWeightVelocity = env->GetFloatField(customParam, fieldID_stdWeightVelocity); - params->std_weight_velocity = stdWeightVelocity; - jfieldID fieldID_smoothParams = env->GetFieldID(param_cls, "smoothParams", "[F"); - auto smoothParamsObj = env->GetObjectField(customParam, fieldID_smoothParams); - float *smoothParams = (float *)env->GetFloatArrayElements((jfloatArray)smoothParamsObj, nullptr); - params->smooth_params[0] = smoothParams[0]; - params->smooth_params[1] = smoothParams[1]; - params->smooth_params[2] = smoothParams[2]; - env->ReleaseFloatArrayElements((jfloatArray)smoothParamsObj, smoothParams, JNI_ABORT); + jfieldID fieldID_detInterval = env->GetFieldID(param_cls, "detInterval", "I"); + jint detInterval = env->GetIntField(customParam, fieldID_detInterval); + params->det_interval = (int)detInterval; + jfieldID fieldID_detLabel = env->GetFieldID(param_cls, "detLabel", "I"); + jint detLabel = env->GetIntField(customParam, fieldID_detLabel); + params->det_label = (int)detLabel; + jfieldID fieldID_detThr = env->GetFieldID(param_cls, "detThr", "F"); + jfloat detThr = env->GetFloatField(customParam, fieldID_detThr); + params->det_thr = (float)detThr; + jfieldID fieldID_detMinBboxSize = env->GetFieldID(param_cls, "detMinBboxSize", "F"); + jfloat detMinBboxSize = env->GetFloatField(customParam, fieldID_detMinBboxSize); + params->det_min_bbox_size = (float)detMinBboxSize; + jfieldID fieldID_detNmsThr = env->GetFieldID(param_cls, "detNmsThr", "F"); + jfloat detNmsThr = env->GetFloatField(customParam, fieldID_detNmsThr); + params->det_nms_thr = (float)detNmsThr; + jfieldID fieldID_poseMaxNumBboxes = env->GetFieldID(param_cls, "poseMaxNumBboxes", "I"); + jint poseMaxNumBboxes = env->GetIntField(customParam, fieldID_poseMaxNumBboxes); + params->pose_max_num_bboxes = (int)poseMaxNumBboxes; + jfieldID fieldID_poseKptThr = env->GetFieldID(param_cls, "poseKptThr", "F"); + jfloat poseKptThr = env->GetFloatField(customParam, fieldID_poseKptThr); + params->pose_kpt_thr = (float)poseKptThr; + jfieldID fieldID_poseMinKeypoints = env->GetFieldID(param_cls, "poseMinKeypoints", "I"); + jint poseMinKeypoints = env->GetIntField(customParam, fieldID_poseMinKeypoints); + params->pose_min_keypoints = (int)poseMinKeypoints; + jfieldID fieldID_poseBboxScale = env->GetFieldID(param_cls, "poseBboxScale", "F"); + jfloat poseBboxScale = env->GetFloatField(customParam, fieldID_poseBboxScale); + params->pose_bbox_scale = (float)poseBboxScale; + jfieldID fieldID_poseMinBboxSize = env->GetFieldID(param_cls, "poseMinBboxSize", "F"); + jfloat poseMinBboxSize = env->GetFloatField(customParam, fieldID_poseMinBboxSize); + params->pose_min_bbox_size = (float)poseMinBboxSize; + jfieldID fieldID_poseNmsThr = env->GetFieldID(param_cls, "poseNmsThr", "F"); + jfloat poseNmsThr = env->GetFloatField(customParam, fieldID_poseNmsThr); + params->pose_nms_thr = (float)poseNmsThr; + jfieldID fieldID_keypointSigmas = env->GetFieldID(param_cls, "keypointSigmas", "[F"); + auto keypointSigmasObj = env->GetObjectField(customParam, fieldID_keypointSigmas); + float* keypointSigmas = + (float*)env->GetFloatArrayElements((jfloatArray)keypointSigmasObj, nullptr); + params->keypoint_sigmas = keypointSigmas; + env->ReleaseFloatArrayElements((jfloatArray)keypointSigmasObj, keypointSigmas, JNI_ABORT); + jfieldID fieldID_keypointSigmasSize = env->GetFieldID(param_cls, "keypointSigmasSize", "I"); + jint keypointSigmasSize = env->GetIntField(customParam, fieldID_keypointSigmasSize); + 
params->keypoint_sigmas_size = keypointSigmasSize; + jfieldID fieldID_trackIouThr = env->GetFieldID(param_cls, "trackIouThr", "F"); + jfloat trackIouThr = env->GetFloatField(customParam, fieldID_trackIouThr); + params->track_iou_thr = trackIouThr; + jfieldID fieldID_trackMaxMissing = env->GetFieldID(param_cls, "trackMaxMissing", "I"); + jint trackMaxMissing = env->GetIntField(customParam, fieldID_trackMaxMissing); + params->track_max_missing = trackMaxMissing; + jfieldID fieldID_trackHistorySize = env->GetFieldID(param_cls, "trackHistorySize", "I"); + jint trackHistorySize = env->GetIntField(customParam, fieldID_trackHistorySize); + params->track_history_size = trackHistorySize; + jfieldID fieldID_stdWeightPosition = env->GetFieldID(param_cls, "stdWeightPosition", "F"); + jfloat stdWeightPosition = env->GetFloatField(customParam, fieldID_stdWeightPosition); + params->std_weight_position = stdWeightPosition; + jfieldID fieldID_stdWeightVelocity = env->GetFieldID(param_cls, "stdWeightVelocity", "F"); + jfloat stdWeightVelocity = env->GetFloatField(customParam, fieldID_stdWeightVelocity); + params->std_weight_velocity = stdWeightVelocity; + jfieldID fieldID_smoothParams = env->GetFieldID(param_cls, "smoothParams", "[F"); + auto smoothParamsObj = env->GetObjectField(customParam, fieldID_smoothParams); + float* smoothParams = (float*)env->GetFloatArrayElements((jfloatArray)smoothParamsObj, nullptr); + params->smooth_params[0] = smoothParams[0]; + params->smooth_params[1] = smoothParams[1]; + params->smooth_params[2] = smoothParams[2]; + env->ReleaseFloatArrayElements((jfloatArray)smoothParamsObj, smoothParams, JNI_ABORT); } -jobject Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv *env, jobject) { - mmdeploy_pose_tracker_param_t params{}; - mmdeploy_pose_tracker_default_params(&params); - return param_cpp_to_java(env, &params); +jobject Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv* env, jobject) +{ + mmdeploy_pose_tracker_param_t params{}; + mmdeploy_pose_tracker_default_params(&params); + return param_cpp_to_java(env, &params); } -jlong Java_mmdeploy_PoseTracker_createState(JNIEnv *env, jobject, jlong pipeline, - jobject paramsObject) { - mmdeploy_pose_tracker_state_t state{}; - mmdeploy_pose_tracker_param_t params{}; - param_java_to_cpp(env, &params, paramsObject); - auto ec = mmdeploy_pose_tracker_create_state((mmdeploy_pose_tracker_t)pipeline, &params, &state); - if (ec) { - MMDEPLOY_ERROR("failed to create pose tracker state, code = {}", ec); - return -1; - } - return (jlong)state; +jlong Java_mmdeploy_PoseTracker_createState(JNIEnv* env, jobject, jlong pipeline, jobject paramsObject) +{ + mmdeploy_pose_tracker_state_t state{}; + mmdeploy_pose_tracker_param_t params{}; + param_java_to_cpp(env, &params, paramsObject); + auto ec = mmdeploy_pose_tracker_create_state((mmdeploy_pose_tracker_t)pipeline, &params, &state); + if (ec) + { + MMDEPLOY_ERROR("failed to create pose tracker state, code = {}", ec); + return -1; + } + return (jlong)state; } -void Java_mmdeploy_PoseTracker_destroyState(JNIEnv *, jobject, jlong state) { - MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); - mmdeploy_pose_tracker_destroy_state((mmdeploy_pose_tracker_state_t)state); +void Java_mmdeploy_PoseTracker_destroyState(JNIEnv*, jobject, jlong state) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_PoseTracker_destroy"); + mmdeploy_pose_tracker_destroy_state((mmdeploy_pose_tracker_state_t)state); }
-jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv *env, jobject thiz, jlong handle, - jlongArray states, jobjectArray frames, - jintArray detects, jintArray counts) { - return With(env, frames, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv* env, jobject thiz, jlong handle, jlongArray states, jobjectArray frames, jintArray detects, jintArray counts) +{ + return With(env, frames, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_pose_tracker_target_t *results{}; int *result_count{}; auto states_array = env->GetLongArrayElements(states, nullptr); @@ -189,6 +207,5 @@ jobjectArray Java_mmdeploy_PoseTracker_apply(JNIEnv *env, jobject thiz, jlong ha env->ReleaseLongArrayElements(states, states_array, 0); env->ReleaseIntArrayElements(detects, detects_array, 0); mmdeploy_pose_tracker_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h index 8e8d3905c8..1de79b1eaa 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_PoseTracker.h @@ -3,54 +3,54 @@ /* Header for class mmdeploy_PoseTracker */ #ifndef _Included_mmdeploy_PoseTracker -#define _Included_mmdeploy_PoseTracker -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_PoseTracker - * Method: create - * Signature: (JJJ)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_create(JNIEnv *, jobject, jlong, jlong, jlong); + #define _Included_mmdeploy_PoseTracker + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_PoseTracker + * Method: create + * Signature: (JJJ)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_create(JNIEnv*, jobject, jlong, jlong, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseTracker + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: createState - * Signature: (JLmmdeploy/PoseTracker/Params;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_createState(JNIEnv *, jobject, jlong, jobject); + /* + * Class: mmdeploy_PoseTracker + * Method: createState + * Signature: (JLmmdeploy/PoseTracker/Params;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_PoseTracker_createState(JNIEnv*, jobject, jlong, jobject); -/* - * Class: mmdeploy_PoseTracker - * Method: destroyState - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroyState(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_PoseTracker + * Method: destroyState + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_PoseTracker_destroyState(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_PoseTracker - * Method: setDefaultParams - * Signature: ()Lmmdeploy/PoseTracker/Params; - */ -JNIEXPORT jobject JNICALL Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv *, jobject); + /* + * Class: mmdeploy_PoseTracker + * Method: setDefaultParams + * Signature: ()Lmmdeploy/PoseTracker/Params; + */ + JNIEXPORT jobject JNICALL Java_mmdeploy_PoseTracker_setDefaultParams(JNIEnv*, jobject); -/* - * Class: mmdeploy_PoseTracker - * Method: apply - * Signature: (J[J[Lmmdeploy/Mat;[I[I)[Lmmdeploy/PoseTracker/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseTracker_apply(JNIEnv *, jobject, jlong, jlongArray, - jobjectArray, jintArray, jintArray); + /* + * Class: mmdeploy_PoseTracker
+ * Method: apply + * Signature: (J[J[Lmmdeploy/Mat;[I[I)[Lmmdeploy/PoseTracker/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_PoseTracker_apply(JNIEnv*, jobject, jlong, jlongArray, jobjectArray, jintArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp index 2c63233c5c..2ff419ec7a 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.cpp @@ -6,19 +6,22 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Profiler_create(JNIEnv *env, jobject, jstring path) { - auto profiler_path = env->GetStringUTFChars(path, nullptr); - mmdeploy_profiler_t profiler{}; - auto ec = mmdeploy_profiler_create(profiler_path, &profiler); - env->ReleaseStringUTFChars(path, profiler_path); - if (ec) { - MMDEPLOY_ERROR("failed to create profiler, code = {}", ec); - return -1; - } - return (jlong)profiler; +jlong Java_mmdeploy_Profiler_create(JNIEnv* env, jobject, jstring path) +{ + auto profiler_path = env->GetStringUTFChars(path, nullptr); + mmdeploy_profiler_t profiler{}; + auto ec = mmdeploy_profiler_create(profiler_path, &profiler); + env->ReleaseStringUTFChars(path, profiler_path); + if (ec) + { + MMDEPLOY_ERROR("failed to create profiler, code = {}", ec); + return -1; + } + return (jlong)profiler; } -void Java_mmdeploy_Profiler_destroy(JNIEnv *, jobject, jlong profiler_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Profiler_destroy"); - mmdeploy_profiler_destroy((mmdeploy_profiler_t)profiler_); +void Java_mmdeploy_Profiler_destroy(JNIEnv*, jobject, jlong profiler_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Profiler_destroy"); + mmdeploy_profiler_destroy((mmdeploy_profiler_t)profiler_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h index 2bcdbc42cc..9e829ad38c 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Profiler.h @@ -3,25 +3,26 @@ /* Header for class mmdeploy_Profiler */ #ifndef _Included_mmdeploy_Profiler -#define _Included_mmdeploy_Profiler -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Profiler - * Method: create - * Signature: (Ljava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Profiler_create(JNIEnv *, jobject, jstring); + #define _Included_mmdeploy_Profiler + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Profiler + * Method: create + * Signature: (Ljava/lang/String;)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Profiler_create(JNIEnv*, jobject, jstring); -/* - * Class: mmdeploy_Profiler - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Profiler_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Profiler + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Profiler_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp index f124d5edae..abc630afa6 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Restorer_create(JNIEnv *env, 
jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_restorer_t restorer{}; - auto ec = mmdeploy_restorer_create_by_path(model_path, device_name, (int)device_id, &restorer); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create restorer, code = {}", ec); - return -1; - } - return (jlong)restorer; +jlong Java_mmdeploy_Restorer_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_restorer_t restorer{}; + auto ec = mmdeploy_restorer_create_by_path(model_path, device_name, (int)device_id, &restorer); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create restorer, code = {}", ec); + return -1; + } + return (jlong)restorer; } -void Java_mmdeploy_Restorer_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Restorer_destroy"); - mmdeploy_restorer_destroy((mmdeploy_restorer_t)handle); +void Java_mmdeploy_Restorer_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Restorer_destroy"); + mmdeploy_restorer_destroy((mmdeploy_restorer_t)handle); } -jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_mat_t *results{}; auto ec = mmdeploy_restorer_apply((mmdeploy_restorer_t)handle, imgs, size, &results); if (ec) { @@ -68,6 +71,5 @@ jobjectArray Java_mmdeploy_Restorer_apply(JNIEnv *env, jobject thiz, jlong handl current_result++; } mmdeploy_restorer_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h index 78b09787fe..7a4aec079b 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Restorer.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Restorer */ #ifndef _Included_mmdeploy_Restorer -#define _Included_mmdeploy_Restorer -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Restorer - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Restorer_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Restorer + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Restorer + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Restorer_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Restorer - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Restorer_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Restorer + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Restorer_destroy(JNIEnv*, jobject, jlong); -/* - * Class: 
mmdeploy_Restorer - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Restorer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Restorer_apply(JNIEnv *, jobject, jlong, jobjectArray); + /* + * Class: mmdeploy_Restorer + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Restorer/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Restorer_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp index 3872e7e158..9b34659aa5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_RotatedDetector_create(JNIEnv *env, jobject, jstring modelPath, - jstring deviceName, jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_rotated_detector_t rotated_detector{}; - auto ec = mmdeploy_rotated_detector_create_by_path(model_path, device_name, (int)device_id, - &rotated_detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create rotated detector, code = {}", ec); - return -1; - } - return (jlong)rotated_detector; +jlong Java_mmdeploy_RotatedDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_rotated_detector_t rotated_detector{}; + auto ec = mmdeploy_rotated_detector_create_by_path(model_path, device_name, (int)device_id, &rotated_detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create rotated detector, code = {}", ec); + return -1; + } + return (jlong)rotated_detector; } -void Java_mmdeploy_RotatedDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_RotatedDetector_destroy"); - mmdeploy_rotated_detector_destroy((mmdeploy_rotated_detector_t)handle); +void Java_mmdeploy_RotatedDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_RotatedDetector_destroy"); + mmdeploy_rotated_detector_destroy((mmdeploy_rotated_detector_t)handle); } -jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_rotated_detection_t *results{}; int *result_count{}; auto ec = mmdeploy_rotated_detector_apply((mmdeploy_rotated_detector_t)handle, imgs, size, @@ -56,6 +58,5 @@ jobjectArray Java_mmdeploy_RotatedDetector_apply(JNIEnv *env, jobject thiz, jlon } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_rotated_detector_release_result(results, result_count); - return array; - }); + return array; }); } diff --git 
a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h index 6de527ec40..7327b791ea 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_RotatedDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_RotatedDetector */ #ifndef _Included_mmdeploy_RotatedDetector -#define _Included_mmdeploy_RotatedDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_RotatedDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_RotatedDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_RotatedDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_RotatedDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_RotatedDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_RotatedDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_RotatedDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_RotatedDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_RotatedDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_RotatedDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/RotatedDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_RotatedDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: mmdeploy_RotatedDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/RotatedDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_RotatedDetector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp index 2c1f1c42c0..3ab391c44d 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.cpp @@ -7,17 +7,20 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Scheduler_createThreadPool(JNIEnv *env, jobject, jint numThreads) { - mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread_pool((int)numThreads); - return (jlong)scheduler; +jlong Java_mmdeploy_Scheduler_createThreadPool(JNIEnv* env, jobject, jint numThreads) +{ + mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread_pool((int)numThreads); + return (jlong)scheduler; } -jlong Java_mmdeploy_Scheduler_createThread(JNIEnv *env, jobject) { - mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread(); - return (jlong)scheduler; +jlong Java_mmdeploy_Scheduler_createThread(JNIEnv* env, jobject) +{ + mmdeploy_scheduler_t scheduler = mmdeploy_executor_create_thread(); + return (jlong)scheduler; } -void Java_mmdeploy_Scheduler_destroy(JNIEnv *, jobject, jlong scheduler_) { - MMDEPLOY_DEBUG("Java_mmdeploy_Scheduler_destroy"); - mmdeploy_scheduler_destroy((mmdeploy_scheduler_t)scheduler_); +void Java_mmdeploy_Scheduler_destroy(JNIEnv*, jobject, jlong scheduler_) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Scheduler_destroy"); + mmdeploy_scheduler_destroy((mmdeploy_scheduler_t)scheduler_); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h 
b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h index 363015cf95..8774db0fc7 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Scheduler.h @@ -3,32 +3,33 @@ /* Header for class mmdeploy_Scheduler */ #ifndef _Included_mmdeploy_Scheduler -#define _Included_mmdeploy_Scheduler -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Scheduler - * Method: createThreadPool - * Signature: (I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThreadPool(JNIEnv *, jclass, jint); + #define _Included_mmdeploy_Scheduler + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Scheduler + * Method: createThreadPool + * Signature: (I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThreadPool(JNIEnv*, jclass, jint); -/* - * Class: mmdeploy_Scheduler - * Method: createThread - * Signature: ()J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThread(JNIEnv *, jclass); + /* + * Class: mmdeploy_Scheduler + * Method: createThread + * Signature: ()J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Scheduler_createThread(JNIEnv*, jclass); -/* - * Class: mmdeploy_Scheduler - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Scheduler_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Scheduler + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Scheduler_destroy(JNIEnv*, jobject, jlong); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp index 12df31a49e..8942041c8c 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.cpp @@ -6,29 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_Segmentor_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_segmentor_t segmentor{}; - auto ec = mmdeploy_segmentor_create_by_path(model_path, device_name, (int)device_id, &segmentor); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create segmentor, code = {}", ec); - return -1; - } - return (jlong)segmentor; +jlong Java_mmdeploy_Segmentor_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_segmentor_t segmentor{}; + auto ec = mmdeploy_segmentor_create_by_path(model_path, device_name, (int)device_id, &segmentor); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create segmentor, code = {}", ec); + return -1; + } + return (jlong)segmentor; } -void Java_mmdeploy_Segmentor_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_Segmentor_destroy"); - mmdeploy_segmentor_destroy((mmdeploy_segmentor_t)handle); +void Java_mmdeploy_Segmentor_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_Segmentor_destroy"); + mmdeploy_segmentor_destroy((mmdeploy_segmentor_t)handle); } -jobjectArray 
Java_mmdeploy_Segmentor_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_Segmentor_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_segmentation_t *results{}; auto ec = mmdeploy_segmentor_apply((mmdeploy_segmentor_t)handle, imgs, size, &results); if (ec) { @@ -65,6 +68,5 @@ jobjectArray Java_mmdeploy_Segmentor_apply(JNIEnv *env, jobject thiz, jlong hand env->SetObjectArrayElement(array, i, res); } mmdeploy_segmentor_release_result(results, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h index afdf157bec..ec42c52dd5 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_Segmentor.h @@ -3,33 +3,33 @@ /* Header for class mmdeploy_Segmentor */ #ifndef _Included_mmdeploy_Segmentor -#define _Included_mmdeploy_Segmentor -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_Segmentor - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_Segmentor_create(JNIEnv *, jobject, jstring, jstring, jint); + #define _Included_mmdeploy_Segmentor + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_Segmentor + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_Segmentor_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_Segmentor - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_Segmentor_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_Segmentor + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_Segmentor_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_Segmentor - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Segmentor/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Segmentor_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_Segmentor + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/Segmentor/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_Segmentor_apply(JNIEnv*, jobject, jlong, jobjectArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp index 943d1e625b..adc1abe5cd 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_TextDetector_create(JNIEnv *env, jobject, jstring modelPath, jstring deviceName, - jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_text_detector_t text_detector{}; - auto ec = mmdeploy_text_detector_create_by_path(model_path, device_name, (int)device_id, - &text_detector); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create text_detector, code = {}", ec); - return -1; - } 
- return (jlong)text_detector; +jlong Java_mmdeploy_TextDetector_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_text_detector_t text_detector{}; + auto ec = mmdeploy_text_detector_create_by_path(model_path, device_name, (int)device_id, &text_detector); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create text_detector, code = {}", ec); + return -1; + } + return (jlong)text_detector; } -void Java_mmdeploy_TextDetector_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_TextDetector_destroy"); - mmdeploy_text_detector_destroy((mmdeploy_text_detector_t)handle); +void Java_mmdeploy_TextDetector_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_TextDetector_destroy"); + mmdeploy_text_detector_destroy((mmdeploy_text_detector_t)handle); } -jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jintArray counts) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jintArray counts) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_text_detection_t *results{}; int *result_count{}; auto ec = mmdeploy_text_detector_apply((mmdeploy_text_detector_t)handle, imgs, size, &results, @@ -61,6 +63,5 @@ jobjectArray Java_mmdeploy_TextDetector_apply(JNIEnv *env, jobject thiz, jlong h } env->ReleaseIntArrayElements(counts, counts_array, 0); mmdeploy_text_detector_release_result(results, result_count, size); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h index dc5574f77b..6a5df47924 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextDetector.h @@ -3,34 +3,33 @@ /* Header for class mmdeploy_TextDetector */ #ifndef _Included_mmdeploy_TextDetector -#define _Included_mmdeploy_TextDetector -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_TextDetector - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_TextDetector_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_TextDetector + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_TextDetector + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_TextDetector_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_TextDetector - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_TextDetector_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_TextDetector + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_TextDetector_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_TextDetector - * Method: apply - * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/TextDetector/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextDetector_apply(JNIEnv *, jobject, jlong, - jobjectArray, jintArray); + /* + * Class: 
mmdeploy_TextDetector + * Method: apply + * Signature: (J[Lmmdeploy/Mat;[I)[Lmmdeploy/TextDetector/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextDetector_apply(JNIEnv*, jobject, jlong, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp index 06987fb623..607b7c2ee8 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.cpp @@ -6,30 +6,32 @@ #include "mmdeploy/apis/java/native/common.h" #include "mmdeploy/core/logger.h" -jlong Java_mmdeploy_TextRecognizer_create(JNIEnv *env, jobject, jstring modelPath, - jstring deviceName, jint device_id) { - auto model_path = env->GetStringUTFChars(modelPath, nullptr); - auto device_name = env->GetStringUTFChars(deviceName, nullptr); - mmdeploy_text_recognizer_t text_recognizer{}; - auto ec = mmdeploy_text_recognizer_create_by_path(model_path, device_name, (int)device_id, - &text_recognizer); - env->ReleaseStringUTFChars(modelPath, model_path); - env->ReleaseStringUTFChars(deviceName, device_name); - if (ec) { - MMDEPLOY_ERROR("failed to create text recognizer, code = {}", ec); - return -1; - } - return (jlong)text_recognizer; +jlong Java_mmdeploy_TextRecognizer_create(JNIEnv* env, jobject, jstring modelPath, jstring deviceName, jint device_id) +{ + auto model_path = env->GetStringUTFChars(modelPath, nullptr); + auto device_name = env->GetStringUTFChars(deviceName, nullptr); + mmdeploy_text_recognizer_t text_recognizer{}; + auto ec = mmdeploy_text_recognizer_create_by_path(model_path, device_name, (int)device_id, &text_recognizer); + env->ReleaseStringUTFChars(modelPath, model_path); + env->ReleaseStringUTFChars(deviceName, device_name); + if (ec) + { + MMDEPLOY_ERROR("failed to create text recognizer, code = {}", ec); + return -1; + } + return (jlong)text_recognizer; } -void Java_mmdeploy_TextRecognizer_destroy(JNIEnv *, jobject, jlong handle) { - MMDEPLOY_DEBUG("Java_mmdeploy_TextRecognizer_destroy"); // maybe use info? - mmdeploy_text_recognizer_destroy((mmdeploy_text_recognizer_t)handle); +void Java_mmdeploy_TextRecognizer_destroy(JNIEnv*, jobject, jlong handle) +{ + MMDEPLOY_DEBUG("Java_mmdeploy_TextRecognizer_destroy"); // maybe use info? 
+ mmdeploy_text_recognizer_destroy((mmdeploy_text_recognizer_t)handle); } -jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray { +jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) -> jobjectArray + { mmdeploy_text_recognition_t *results{}; auto ec = mmdeploy_text_recognizer_apply((mmdeploy_text_recognizer_t)handle, imgs, size, &results); @@ -51,13 +53,12 @@ jobjectArray Java_mmdeploy_TextRecognizer_apply(JNIEnv *env, jobject thiz, jlong env->SetObjectArrayElement(array, i, res); } mmdeploy_text_recognizer_release_result(results, size); - return array; - }); + return array; }); } -jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *env, jobject thiz, jlong handle, - jobjectArray images, jobjectArray bboxes, - jintArray bbox_count) { - return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) { +jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv* env, jobject thiz, jlong handle, jobjectArray images, jobjectArray bboxes, jintArray bbox_count) +{ + return With(env, images, [&](const mmdeploy_mat_t imgs[], int size) + { mmdeploy_text_recognition_t *recog_results{}; auto *det_results = new mmdeploy_text_detection_t[env->GetArrayLength(bboxes)]; int *det_result_count = new int[env->GetArrayLength(bbox_count)]; @@ -100,6 +101,5 @@ jobjectArray Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *env, jobject thiz, j } mmdeploy_text_recognizer_release_result(recog_results, size); mmdeploy_text_detector_release_result(det_results, det_result_count, 1); - return array; - }); + return array; }); } diff --git a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h index 721c17f2b6..13ed048b7e 100644 --- a/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h +++ b/csrc/mmdeploy/apis/java/native/mmdeploy_TextRecognizer.h @@ -3,43 +3,40 @@ /* Header for class mmdeploy_TextRecognizer */ #ifndef _Included_mmdeploy_TextRecognizer -#define _Included_mmdeploy_TextRecognizer -#ifdef __cplusplus -extern "C" { -#endif -/* - * Class: mmdeploy_TextRecognizer - * Method: create - * Signature: (Ljava/lang/String;Ljava/lang/String;I)J - */ -JNIEXPORT jlong JNICALL Java_mmdeploy_TextRecognizer_create(JNIEnv *, jobject, jstring, jstring, - jint); + #define _Included_mmdeploy_TextRecognizer + #ifdef __cplusplus +extern "C" +{ + #endif + /* + * Class: mmdeploy_TextRecognizer + * Method: create + * Signature: (Ljava/lang/String;Ljava/lang/String;I)J + */ + JNIEXPORT jlong JNICALL Java_mmdeploy_TextRecognizer_create(JNIEnv*, jobject, jstring, jstring, jint); -/* - * Class: mmdeploy_TextRecognizer - * Method: destroy - * Signature: (J)V - */ -JNIEXPORT void JNICALL Java_mmdeploy_TextRecognizer_destroy(JNIEnv *, jobject, jlong); + /* + * Class: mmdeploy_TextRecognizer + * Method: destroy + * Signature: (J)V + */ + JNIEXPORT void JNICALL Java_mmdeploy_TextRecognizer_destroy(JNIEnv*, jobject, jlong); -/* - * Class: mmdeploy_TextRecognizer - * Method: apply - * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/TextRecognizer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_apply(JNIEnv *, jobject, jlong, - jobjectArray); + /* + * Class: mmdeploy_TextRecognizer + * Method: apply + * Signature: (J[Lmmdeploy/Mat;)[Lmmdeploy/TextRecognizer/Result; + */ + 
JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_apply(JNIEnv*, jobject, jlong, jobjectArray); -/* - * Class: mmdeploy_TextRecognizer - * Method: applyBbox - * Signature: (J[Lmmdeploy/Mat;[Lmmdeploy/TextDetector/Result;[I)[Lmmdeploy/TextRecognizer/Result; - */ -JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv *, jobject, jlong, - jobjectArray, jobjectArray, - jintArray); + /* + * Class: mmdeploy_TextRecognizer + * Method: applyBbox + * Signature: (J[Lmmdeploy/Mat;[Lmmdeploy/TextDetector/Result;[I)[Lmmdeploy/TextRecognizer/Result; + */ + JNIEXPORT jobjectArray JNICALL Java_mmdeploy_TextRecognizer_applyBbox(JNIEnv*, jobject, jlong, jobjectArray, jobjectArray, jintArray); -#ifdef __cplusplus + #ifdef __cplusplus } -#endif + #endif #endif diff --git a/csrc/mmdeploy/apis/python/CMakeLists.txt b/csrc/mmdeploy/apis/python/CMakeLists.txt index 12e7946e31..173332d0f7 100644 --- a/csrc/mmdeploy/apis/python/CMakeLists.txt +++ b/csrc/mmdeploy/apis/python/CMakeLists.txt @@ -3,53 +3,48 @@ cmake_minimum_required(VERSION 3.14) project(mmdeploy_runtime) -set(MMDEPLOY_RUNTIME_SRCS - common.cpp - internal.cpp - pipeline.cpp) +set(MMDEPLOY_RUNTIME_SRCS common.cpp internal.cpp pipeline.cpp) set(CMAKE_CXX_STANDARD 17) -if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) - # standard alone project - add_subdirectory(${CMAKE_SOURCE_DIR}/../../../../third_party/pybind11 - ${CMAKE_CURRENT_BINARY_DIR}/pybind11) - find_package(MMDeploy REQUIRED) -elseif (NOT TARGET pybind11) - add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) -endif () +if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + # standard alone project + add_subdirectory(${CMAKE_SOURCE_DIR}/../../../../third_party/pybind11 + ${CMAKE_CURRENT_BINARY_DIR}/pybind11) + find_package(MMDeploy REQUIRED) +elseif(NOT TARGET pybind11) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) +endif() -foreach (task_name ${MMDEPLOY_TASKS}) - list(APPEND MMDEPLOY_RUNTIME_SRCS ${task_name}.cpp) -endforeach () +foreach(task_name ${MMDEPLOY_TASKS}) + list(APPEND MMDEPLOY_RUNTIME_SRCS ${task_name}.cpp) +endforeach() pybind11_add_module(${PROJECT_NAME} ${MMDEPLOY_RUNTIME_SRCS}) # disable MMDEPLOY_CXX_USE_OPENCV in apis/cxx/mmdeploy/common.hpp target_compile_definitions(${PROJECT_NAME} PRIVATE -DMMDEPLOY_CXX_USE_OPENCV=0) -if (APPLE) - set_target_properties(${PROJECT_NAME} PROPERTIES - BUILD_RPATH "@loader_path" - INSTALL_RPATH "@loader_path") -else () - set_target_properties(${PROJECT_NAME} PROPERTIES - BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN") -endif () +if(APPLE) + set_target_properties(${PROJECT_NAME} PROPERTIES BUILD_RPATH "@loader_path" + INSTALL_RPATH "@loader_path") +else() + set_target_properties(${PROJECT_NAME} PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN") +endif() # https://github.com/pybind/pybind11/issues/1604 -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - target_compile_options(${PROJECT_NAME} PRIVATE -fsized-deallocation) -endif () - -if (MMDEPLOY_BUILD_SDK_MONOLITHIC) - target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy) -else () - mmdeploy_load_static(${PROJECT_NAME} MMDeployStaticModules) - mmdeploy_load_dynamic(${PROJECT_NAME} MMDeployDynamicModules) - target_link_libraries(${PROJECT_NAME} PRIVATE MMDeployLibs) -endif () - -target_include_directories(${PROJECT_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/.. 
- ${CMAKE_CURRENT_SOURCE_DIR}) +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + target_compile_options(${PROJECT_NAME} PRIVATE -fsized-deallocation) +endif() + +if(MMDEPLOY_BUILD_SDK_MONOLITHIC) + target_link_libraries(${PROJECT_NAME} PRIVATE mmdeploy) +else() + mmdeploy_load_static(${PROJECT_NAME} MMDeployStaticModules) + mmdeploy_load_dynamic(${PROJECT_NAME} MMDeployDynamicModules) + target_link_libraries(${PROJECT_NAME} PRIVATE MMDeployLibs) +endif() + +target_include_directories( + ${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. + ${CMAKE_CURRENT_SOURCE_DIR}) install(DIRECTORY ${CMAKE_SOURCE_DIR}/demo/python/ DESTINATION example/python) diff --git a/csrc/mmdeploy/apis/python/classifier.cpp b/csrc/mmdeploy/apis/python/classifier.cpp index 9916909c86..983b3357b5 100644 --- a/csrc/mmdeploy/apis/python/classifier.cpp +++ b/csrc/mmdeploy/apis/python/classifier.cpp @@ -4,64 +4,76 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyClassifier { - public: - PyClassifier(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_classifier_create_by_path(model_path, device_name, device_id, &classifier_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create classifier"); - } - } - ~PyClassifier() { - mmdeploy_classifier_destroy(classifier_); - classifier_ = {}; - } + class PyClassifier + { + public: + PyClassifier(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_classifier_create_by_path(model_path, device_name, device_id, &classifier_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create classifier"); + } + } + ~PyClassifier() + { + mmdeploy_classifier_destroy(classifier_); + classifier_ = {}; + } - std::vector<std::vector<std::pair<int, float>>> Apply(const std::vector<PyImage>& imgs) { - std::vector<mmdeploy_mat_t> mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_classification_t* results{}; - int* result_count{}; - auto status = mmdeploy_classifier_apply(classifier_, mats.data(), (int)mats.size(), &results, - &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply classifier, code: " + std::to_string(status)); - } - auto output = std::vector<std::vector<std::pair<int, float>>>{}; - output.reserve(mats.size()); - auto result_ptr = results; - for (int i = 0; i < mats.size(); ++i) { - std::vector<std::pair<int, float>> label_score; - for (int j = 0; j < result_count[i]; ++j) { - label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); - } - output.push_back(std::move(label_score)); - result_ptr += result_count[i]; - } - mmdeploy_classifier_release_result(results, result_count, (int)mats.size()); - return output; - } + std::vector<std::vector<std::pair<int, float>>> Apply(const std::vector<PyImage>& imgs) + { + std::vector<mmdeploy_mat_t> mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_classification_t* results{}; + int* result_count{}; + auto status = mmdeploy_classifier_apply(classifier_, mats.data(), (int)mats.size(), &results, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply classifier, code: " + std::to_string(status)); + } + auto output = std::vector<std::vector<std::pair<int, float>>>{}; + output.reserve(mats.size()); + auto result_ptr = results; + for (int i = 0; i < mats.size(); ++i) + { + std::vector<std::pair<int, float>> label_score; + for (int j = 0; j < result_count[i]; ++j) + { + label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); + }
+ output.push_back(std::move(label_score)); + result_ptr += result_count[i]; + } + mmdeploy_classifier_release_result(results, result_count, (int)mats.size()); + return output; + } - private: - mmdeploy_classifier_t classifier_{}; -}; + private: + mmdeploy_classifier_t classifier_{}; + }; -static PythonBindingRegisterer register_classifier{[](py::module& m) { - py::class_<PyClassifier>(m, "Classifier") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique<PyClassifier>(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyClassifier* self, const PyImage& img) { return self->Apply(std::vector<PyImage>{img})[0]; }) - .def("batch", &PyClassifier::Apply); -}}; + static PythonBindingRegisterer register_classifier{[](py::module& m) + { + py::class_<PyClassifier>(m, "Classifier") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique<PyClassifier>(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyClassifier* self, const PyImage& img) + { return self->Apply(std::vector<PyImage>{img})[0]; }) + .def("batch", &PyClassifier::Apply); + }}; } // namespace mmdeploy::python
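The registration idiom reformatted above recurs in every binding file: a file-static PythonBindingRegisterer (declared in common.h later in this patch) appends a callback to gPythonBindings(), and PYBIND11_MODULE replays all callbacks at import time. A hedged sketch of what one more binding would look like under the new style; register_example and example_version are illustrative names only and are not part of the patch:

// Hypothetical extra binding following the pattern above; assumes common.h is included.
static PythonBindingRegisterer register_example{[](py::module& m)
{
    m.def("example_version", []
          { return std::string{"0.0.0"}; });  // placeholder value, not a real mmdeploy API
}};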
are supported"); + } + mat.height = (int)info.shape[0]; + mat.width = (int)info.shape[1]; + mat.channel = channels; + mat.type = MMDEPLOY_DATA_TYPE_UINT8; + mat.data = (uint8_t*)info.ptr; + return mat; } - case ValueType::kObject: { - py::dict dict; - for (auto it = value.begin(); it != value.end(); ++it) { - dict[it.key().c_str()] = ToPyObject(*it); - } - return dict; + + py::object ToPyObject(const Value& value) + { + switch (value.type()) + { + case ValueType::kNull: + return py::none(); + case ValueType::kBool: + return py::bool_(value.get()); + case ValueType::kInt: + return py::int_(value.get()); + case ValueType::kUInt: + return py::int_(value.get()); + case ValueType::kFloat: + return py::float_(value.get()); + case ValueType::kString: + return py::str(value.get()); + case ValueType::kArray: + { + py::list list; + for (const auto& x : value) + { + list.append(ToPyObject(x)); + } + return list; + } + case ValueType::kObject: + { + py::dict dict; + for (auto it = value.begin(); it != value.end(); ++it) + { + dict[it.key().c_str()] = ToPyObject(*it); + } + return dict; + } + case ValueType::kAny: + return py::str(""); + default: + return py::str(""); + } } - case ValueType::kAny: - return py::str(""); - default: - return py::str(""); - } -} -std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type); + std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type); -Value FromPyObject(const py::object& obj) { - if (py::isinstance(obj)) { - return nullptr; - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj)) { - return obj.cast(); - } else if (py::isinstance(obj) || py::isinstance(obj)) { - py::list src(obj); - Value::Array dst; - dst.reserve(src.size()); - for (const auto& item : src) { - dst.push_back(FromPyObject(py::reinterpret_borrow(item))); + Value FromPyObject(const py::object& obj) + { + if (py::isinstance(obj)) + { + return nullptr; + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj)) + { + return obj.cast(); + } + else if (py::isinstance(obj) || py::isinstance(obj)) + { + py::list src(obj); + Value::Array dst; + dst.reserve(src.size()); + for (const auto& item : src) + { + dst.push_back(FromPyObject(py::reinterpret_borrow(item))); + } + return dst; + } + else if (py::isinstance(obj)) + { + py::dict src(obj); + Value::Object dst; + for (const auto& item : src) + { + dst.emplace(item.first.cast(), + FromPyObject(py::reinterpret_borrow(item.second))); + } + return dst; + } + else if (py::isinstance(obj)) + { + const auto& array = obj.cast(); + return *_to_value_internal(&array, MMDEPLOY_TYPE_MAT); + } + else if (py::isinstance(obj)) + { + const auto& model = + *reinterpret_cast(static_cast(obj.cast())); + return model; + } + else + { + std::stringstream ss; + ss << obj.get_type(); + MMDEPLOY_ERROR("unsupported Python object type: {}", ss.str()); + return nullptr; + } + return nullptr; } - return dst; - } else if (py::isinstance(obj)) { - py::dict src(obj); - Value::Object dst; - for (const auto& item : src) { - dst.emplace(item.first.cast(), - FromPyObject(py::reinterpret_borrow(item.second))); - } - return dst; - } else if (py::isinstance(obj)) { - const auto& array = obj.cast(); - return *_to_value_internal(&array, 
- return *_to_value_internal(&array, MMDEPLOY_TYPE_MAT); - } else if (py::isinstance<Model>(obj)) { - const auto& model = - *reinterpret_cast(static_cast(obj.cast())); - return model; - } else { - std::stringstream ss; - ss << obj.get_type(); - MMDEPLOY_ERROR("unsupported Python object type: {}", ss.str()); - return nullptr; - } - return nullptr; -} -std::pair<std::string, int> parse_device(const std::string& device) { - auto pos = device.find(':'); - if (pos == std::string::npos) { - return {device, 0}; // logic for index -1 is not ready on some devices - } - auto name = device.substr(0, pos); - auto index = std::stoi(device.substr(pos + 1)); - return {name, index}; -} + std::pair<std::string, int> parse_device(const std::string& device) + { + auto pos = device.find(':'); + if (pos == std::string::npos) + { + return {device, 0}; // logic for index -1 is not ready on some devices + } + auto name = device.substr(0, pos); + auto index = std::stoi(device.substr(pos + 1)); + return {name, index}; + } -static PythonBindingRegisterer register_model{[](py::module& m) { - py::class_<Model>(m, "Model") - .def(py::init([](const py::str& path) { + static PythonBindingRegisterer register_model{[](py::module& m) + { + py::class_<Model>(m, "Model") + .def(py::init([](const py::str& path) + { MMDEPLOY_DEBUG("py::init([](const py::str& path)"); - return Model(path.cast<std::string>().c_str()); - })) - .def(py::init([](const py::bytes& buffer) { + return Model(path.cast<std::string>().c_str()); })) + .def(py::init([](const py::bytes& buffer) + { MMDEPLOY_DEBUG("py::init([](const py::bytes& buffer)"); py::buffer_info info(py::buffer(buffer).request()); - return Model(info.ptr, info.size); - })); -}}; + return Model(info.ptr, info.size); })); + }}; -static PythonBindingRegisterer register_device{[](py::module& m) { - py::class_<Device>(m, "Device") - .def(py::init([](const std::string& device) { + static PythonBindingRegisterer register_device{[](py::module& m) + { + py::class_<Device>(m, "Device") + .def(py::init([](const std::string& device) + { auto [name, index] = parse_device(device); - return Device(name, index); - })) - .def(py::init([](const std::string& name, int index) { return Device(name, index); })); -}}; + return Device(name, index); })) + .def(py::init([](const std::string& name, int index) + { return Device(name, index); })); + }}; -static PythonBindingRegisterer register_context{[](py::module& m) { - py::class_<Context>(m, "Context") - .def(py::init([](const Device& device) { return Context(device); })) - .def("add", [](Context* self, const std::string& name, const Scheduler& sched) { - self->Add(name, sched); - }); -}}; + static PythonBindingRegisterer register_context{[](py::module& m) + { + py::class_<Context>(m, "Context") + .def(py::init([](const Device& device) + { return Context(device); })) + .def("add", [](Context* self, const std::string& name, const Scheduler& sched) + { self->Add(name, sched); }); + }}; -static PythonBindingRegisterer register_scheduler{[](py::module& m) { - py::class_<Scheduler>(m, "Scheduler") - .def_static("thread_pool", [](int n_workers) { return Scheduler::ThreadPool(n_workers); }) - .def_static("thread", [] { return Scheduler::Thread(); }); -}}; + static PythonBindingRegisterer register_scheduler{[](py::module& m) + { + py::class_<Scheduler>(m, "Scheduler") + .def_static("thread_pool", [](int n_workers) + { return Scheduler::ThreadPool(n_workers); }) + .def_static("thread", [] + { return Scheduler::Thread(); }); + }}; } // namespace mmdeploy::python -PYBIND11_MODULE(mmdeploy_runtime, m) { - for (const auto& f : mmdeploy::python::gPythonBindings()) { - f(m); - } +PYBIND11_MODULE(mmdeploy_runtime, m) +{ + for (const auto& f : mmdeploy::python::gPythonBindings()) + { + f(m); + } }
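Before the next file, a brief hedged sketch of how the converters and parse_device above compose, written as if inside namespace mmdeploy::python with a live interpreter (as there is inside any bound function); the variable names are illustrative and this snippet is not part of the patch:

// Round trip: Python dict -> mmdeploy Value -> Python object, per the
// FromPyObject/ToPyObject definitions above. Assumes the interpreter is up.
py::dict src;
src["device"] = py::str("cuda:0");
src["batch"] = py::int_(2);
Value v = FromPyObject(src);            // dict lands in ValueType::kObject
py::object round_trip = ToPyObject(v);  // and comes back as a py::dict
auto [name, index] = parse_device("cuda:0");  // yields {"cuda", 0} as parsed above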
mmdeploy::python::gPythonBindings()) + { + f(m); + } } diff --git a/csrc/mmdeploy/apis/python/common.h b/csrc/mmdeploy/apis/python/common.h index 5b1ca96b74..e50ed76007 100644 --- a/csrc/mmdeploy/apis/python/common.h +++ b/csrc/mmdeploy/apis/python/common.h @@ -13,24 +13,27 @@ namespace py = pybind11; -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using PyImage = py::array_t; + using PyImage = py::array_t; -std::vector& gPythonBindings(); + std::vector& gPythonBindings(); -mmdeploy_mat_t GetMat(const PyImage& img); + mmdeploy_mat_t GetMat(const PyImage& img); -py::object ToPyObject(const Value& value); + py::object ToPyObject(const Value& value); -Value FromPyObject(const py::object& obj); + Value FromPyObject(const py::object& obj); -class PythonBindingRegisterer { - public: - explicit PythonBindingRegisterer(void (*register_fn)(py::module& m)) { - gPythonBindings().push_back(register_fn); - } -}; + class PythonBindingRegisterer + { + public: + explicit PythonBindingRegisterer(void (*register_fn)(py::module& m)) + { + gPythonBindings().push_back(register_fn); + } + }; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/detector.cpp b/csrc/mmdeploy/apis/python/detector.cpp index 057a92ab00..137998f6b7 100644 --- a/csrc/mmdeploy/apis/python/detector.cpp +++ b/csrc/mmdeploy/apis/python/detector.cpp @@ -4,82 +4,97 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyDetector { - public: - PyDetector(const char* model_path, const char* device_name, int device_id) { - auto status = mmdeploy_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create detector"); - } - } - py::list Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_detection_t* detection{}; - int* result_count{}; - auto status = mmdeploy_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, - &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply detector, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(detection, [result_count, n = mats.size()](auto p) { - mmdeploy_detector_release_result(p, result_count, n); - }); - auto output = py::list{}; - auto result = detection; - for (int i = 0; i < mats.size(); ++i) { - auto bboxes = py::array_t({result_count[i], 5}); - auto labels = py::array_t(result_count[i]); - auto masks = std::vector(); - masks.reserve(result_count[i]); - for (int j = 0; j < result_count[i]; ++j, ++result) { - auto bbox = bboxes.mutable_data(j); - bbox[0] = result->bbox.left; - bbox[1] = result->bbox.top; - bbox[2] = result->bbox.right; - bbox[3] = result->bbox.bottom; - bbox[4] = result->score; - labels.mutable_at(j) = result->label_id; - if (result->mask) { - masks.emplace_back(std::array{result->mask->height, result->mask->width}, // shape - reinterpret_cast(result->mask->data), // data - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); })); - } else { - masks.emplace_back(); + class PyDetector + { + public: + PyDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = mmdeploy_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create detector"); + } + } + py::list 
Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_detection_t* detection{}; + int* result_count{}; + auto status = mmdeploy_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply detector, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(detection, [result_count, n = mats.size()](auto p) + { mmdeploy_detector_release_result(p, result_count, n); }); + auto output = py::list{}; + auto result = detection; + for (int i = 0; i < mats.size(); ++i) + { + auto bboxes = py::array_t({result_count[i], 5}); + auto labels = py::array_t(result_count[i]); + auto masks = std::vector(); + masks.reserve(result_count[i]); + for (int j = 0; j < result_count[i]; ++j, ++result) + { + auto bbox = bboxes.mutable_data(j); + bbox[0] = result->bbox.left; + bbox[1] = result->bbox.top; + bbox[2] = result->bbox.right; + bbox[3] = result->bbox.bottom; + bbox[4] = result->score; + labels.mutable_at(j) = result->label_id; + if (result->mask) + { + masks.emplace_back(std::array{result->mask->height, result->mask->width}, // shape + reinterpret_cast(result->mask->data), // data + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); })); + } + else + { + masks.emplace_back(); + } + } + output.append(py::make_tuple(std::move(bboxes), std::move(labels), std::move(masks))); + } + return output; + } + ~PyDetector() + { + mmdeploy_detector_destroy(detector_); + detector_ = {}; } - } - output.append(py::make_tuple(std::move(bboxes), std::move(labels), std::move(masks))); - } - return output; - } - ~PyDetector() { - mmdeploy_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_detector_t detector_{}; -}; + private: + mmdeploy_detector_t detector_{}; + }; -static PythonBindingRegisterer register_detector{[](py::module& m) { - py::class_(m, "Detector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyDetector* self, const PyImage& img) -> py::tuple { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyDetector::Apply); -}}; + static PythonBindingRegisterer register_detector{[](py::module& m) + { + py::class_(m, "Detector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyDetector* self, const PyImage& img) -> py::tuple + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/executor.cpp b/csrc/mmdeploy/apis/python/executor.cpp index eaa5c1144b..489985f232 100644 --- a/csrc/mmdeploy/apis/python/executor.cpp +++ b/csrc/mmdeploy/apis/python/executor.cpp @@ -8,39 +8,48 @@ #include "mmdeploy/execution/schedulers/single_thread_context.h" #include "mmdeploy/execution/schedulers/static_thread_pool.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -struct PySender { - TypeErasedSender sender_; - - explicit PySender(TypeErasedSender sender) : sender_(std::move(sender)) {} - 
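
A usage sketch for the Detector binding registered above (hypothetical model path); __call__ handles a single image, batch a list of images:

    import numpy as np
    from mmdeploy_runtime import Detector

    detector = Detector(model_path="mmdet_model_dir", device_name="cpu", device_id=0)
    img = np.zeros((480, 640, 3), dtype=np.uint8)  # stands in for a BGR frame
    bboxes, labels, masks = detector(img)          # bboxes: (N, 5) = [left, top, right, bottom, score]
    results = detector.batch([img, img])           # one (bboxes, labels, masks) tuple per image
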
- struct gil_guarded_deleter { - void operator()(py::object* p) const { - py::gil_scoped_acquire _; - delete p; - } - }; - using object_ptr = std::unique_ptr; - - py::object __await__() { - auto future = py::module::import("concurrent.futures").attr("Future")(); + struct PySender { - py::gil_scoped_release _; - StartDetached(std::move(sender_) | - Then([future = object_ptr{new py::object(future)}](const Value& value) mutable { + TypeErasedSender sender_; + + explicit PySender(TypeErasedSender sender) + : sender_(std::move(sender)) + { + } + + struct gil_guarded_deleter + { + void operator()(py::object* p) const + { + py::gil_scoped_acquire _; + delete p; + } + }; + using object_ptr = std::unique_ptr; + + py::object __await__() + { + auto future = py::module::import("concurrent.futures").attr("Future")(); + { + py::gil_scoped_release _; + StartDetached(std::move(sender_) | + Then([future = object_ptr{new py::object(future)}](const Value& value) mutable + { py::gil_scoped_acquire _; future->attr("set_result")(ToPyObject(value)); - delete future.release(); - })); - } - return py::module::import("asyncio").attr("wrap_future")(future).attr("__await__")(); - } -}; - -static PythonBindingRegisterer register_sender{[](py::module& m) { - py::class_>(m, "PySender") - .def("__await__", &PySender::__await__); -}}; + delete future.release(); })); + } + return py::module::import("asyncio").attr("wrap_future")(future).attr("__await__")(); + } + }; + + static PythonBindingRegisterer register_sender{[](py::module& m) + { + py::class_>(m, "PySender") + .def("__await__", &PySender::__await__); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/internal.cpp b/csrc/mmdeploy/apis/python/internal.cpp index 7373c1f184..8c38f5a7ce 100644 --- a/csrc/mmdeploy/apis/python/internal.cpp +++ b/csrc/mmdeploy/apis/python/internal.cpp @@ -9,49 +9,60 @@ #include "mmdeploy/core/model.h" #include "mmdeploy/core/value.h" -namespace mmdeploy { - -namespace python { - -framework::Mat _get_mat(const PyImage& img) { - auto info = img.request(); - if (info.ndim != 3) { - fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); - throw std::runtime_error("continuous uint8 HWC array expected"); - } - auto channels = (int)info.shape[2]; - PixelFormat format; - if (channels == 1) { - format = PixelFormat::kGRAYSCALE; - } else if (channels == 3) { - format = PixelFormat::kBGR; - } else { - throw std::runtime_error("images of 1 or 3 channels are supported"); - } - - return { - (int)info.shape[0], // height - (int)info.shape[1], // width - format, // format - DataType::kINT8, // type - std::shared_ptr(info.ptr, [](void*) {}), // data - framework::Device(0), // device - }; -} - -std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type) { - switch (type) { - case MMDEPLOY_TYPE_MODEL: - return Value(*(const framework::Model*)object); - case MMDEPLOY_TYPE_DEVICE: - return Value(*(const framework::Device*)object); - case MMDEPLOY_TYPE_MAT: - return _get_mat(*(const py::array*)object); - default: - return std::nullopt; - } -} - -} // namespace python +namespace mmdeploy +{ + + namespace python + { + + framework::Mat _get_mat(const PyImage& img) + { + auto info = img.request(); + if (info.ndim != 3) + { + fprintf(stderr, "info.ndim = %d\n", (int)info.ndim); + throw std::runtime_error("continuous uint8 HWC array expected"); + } + auto channels = (int)info.shape[2]; + PixelFormat format; + if (channels == 1) + { + format = PixelFormat::kGRAYSCALE; + } + else if (channels == 3) + { + format = 
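
PySender.__await__ above completes a concurrent.futures.Future outside the GIL and hands it to asyncio.wrap_future, so a sender is directly awaitable from Python; a sketch, assuming some binding elsewhere hands back a PySender (none is added in this file):

    import asyncio

    async def consume(sender):
        # Suspends the coroutine until the C++ sender finishes;
        # the result has already been converted by ToPyObject.
        return await sender
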
PixelFormat::kBGR; + } + else + { + throw std::runtime_error("images of 1 or 3 channels are supported"); + } + + return { + (int)info.shape[0], // height + (int)info.shape[1], // width + format, // format + DataType::kINT8, // type + std::shared_ptr(info.ptr, [](void*) {}), // data + framework::Device(0), // device + }; + } + + std::optional _to_value_internal(const void* object, mmdeploy_context_type_t type) + { + switch (type) + { + case MMDEPLOY_TYPE_MODEL: + return Value(*(const framework::Model*)object); + case MMDEPLOY_TYPE_DEVICE: + return Value(*(const framework::Device*)object); + case MMDEPLOY_TYPE_MAT: + return _get_mat(*(const py::array*)object); + default: + return std::nullopt; + } + } + + } // namespace python } // namespace mmdeploy diff --git a/csrc/mmdeploy/apis/python/pipeline.cpp b/csrc/mmdeploy/apis/python/pipeline.cpp index e3e6237e44..114bce2095 100644 --- a/csrc/mmdeploy/apis/python/pipeline.cpp +++ b/csrc/mmdeploy/apis/python/pipeline.cpp @@ -7,41 +7,47 @@ #include "mmdeploy/core/logger.h" #include "mmdeploy/core/utils/formatter.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using namespace std::literals; + using namespace std::literals; -static PythonBindingRegisterer register_pipeline{[](py::module& m) { - py::class_(m, "Pipeline") - .def(py::init([](const py::object& config, const Context& context) { + static PythonBindingRegisterer register_pipeline{[](py::module& m) + { + py::class_(m, "Pipeline") + .def(py::init([](const py::object& config, const Context& context) + { auto _config = FromPyObject(config); - return std::make_unique(_config, context); - })) - .def("__call__", - [](Pipeline* pipeline, const py::args& args) { - auto inputs = FromPyObject(args); - for (auto& input : inputs) { - input = Value::Array{std::move(input)}; - } - auto outputs = pipeline->Apply(inputs); - for (auto& output : outputs) { - output = std::move(output[0]); - } - py::tuple rets(outputs.size()); - for (int i = 0; i < outputs.size(); ++i) { - rets[i] = ToPyObject(outputs[i]); - } - return rets; - }) - .def("batch", [](Pipeline* pipeline, const py::args& args) { + return std::make_unique(_config, context); })) + .def("__call__", + [](Pipeline* pipeline, const py::args& args) + { + auto inputs = FromPyObject(args); + for (auto& input : inputs) + { + input = Value::Array{std::move(input)}; + } + auto outputs = pipeline->Apply(inputs); + for (auto& output : outputs) + { + output = std::move(output[0]); + } + py::tuple rets(outputs.size()); + for (int i = 0; i < outputs.size(); ++i) + { + rets[i] = ToPyObject(outputs[i]); + } + return rets; + }) + .def("batch", [](Pipeline* pipeline, const py::args& args) + { auto inputs = FromPyObject(args); auto outputs = pipeline->Apply(inputs); py::tuple rets(outputs.size()); for (int i = 0; i < outputs.size(); ++i) { rets[i] = ToPyObject(outputs[i]); } - return rets; - }); -}}; + return rets; }); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/pose_detector.cpp b/csrc/mmdeploy/apis/python/pose_detector.cpp index f9d99eaf14..b6dc96560a 100644 --- a/csrc/mmdeploy/apis/python/pose_detector.cpp +++ b/csrc/mmdeploy/apis/python/pose_detector.cpp @@ -7,122 +7,143 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -using Rect = std::array; + using Rect = std::array; -class PyPoseDetector { - public: - PyPoseDetector(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_pose_detector_create_by_path(model_path, device_name, device_id, 
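
The Pipeline binding above accepts any config that FromPyObject can convert, typically a dict; a sketch with a hypothetical single-output config (not part of the diff):

    import numpy as np
    from mmdeploy_runtime import Context, Device, Pipeline

    context = Context(Device("cpu"))
    config = {"type": "Pipeline", "input": "img", "tasks": [], "output": "img"}  # hypothetical
    pipeline = Pipeline(config, context)
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    outputs = pipeline(img)               # tuple, one entry per pipeline output
    batched = pipeline.batch([img, img])  # batched variant, no per-item wrapping
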
&detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create pose_detector"); - } - } - py::list Apply(const std::vector& imgs, const std::vector>& bboxes) { - if (imgs.size() == 0 && bboxes.size() == 0) { - return py::list{}; - } - if (bboxes.size() != 0 && bboxes.size() != imgs.size()) { - std::ostringstream os; - os << "imgs length not equal with vboxes [" << imgs.size() << " vs " << bboxes.size() << "]"; - throw std::invalid_argument(os.str()); - } + class PyPoseDetector + { + public: + PyPoseDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_pose_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create pose_detector"); + } + } + py::list Apply(const std::vector& imgs, const std::vector>& bboxes) + { + if (imgs.size() == 0 && bboxes.size() == 0) + { + return py::list{}; + } + if (bboxes.size() != 0 && bboxes.size() != imgs.size()) + { + std::ostringstream os; + os << "imgs length not equal with vboxes [" << imgs.size() << " vs " << bboxes.size() << "]"; + throw std::invalid_argument(os.str()); + } - std::vector mats; - std::vector boxes; - std::vector bbox_count; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } + std::vector mats; + std::vector boxes; + std::vector bbox_count; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } - for (auto _boxes : bboxes) { - for (auto _box : _boxes) { - mmdeploy_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; - boxes.push_back(box); - } - bbox_count.push_back(_boxes.size()); - } + for (auto _boxes : bboxes) + { + for (auto _box : _boxes) + { + mmdeploy_rect_t box = {_box[0], _box[1], _box[2], _box[3]}; + boxes.push_back(box); + } + bbox_count.push_back(_boxes.size()); + } - // full image - if (bboxes.size() == 0) { - for (int i = 0; i < mats.size(); i++) { - mmdeploy_rect_t box = {0.f, 0.f, mats[i].width - 1.f, mats[i].height - 1.f}; - boxes.push_back(box); - bbox_count.push_back(1); - } - } + // full image + if (bboxes.size() == 0) + { + for (int i = 0; i < mats.size(); i++) + { + mmdeploy_rect_t box = {0.f, 0.f, mats[i].width - 1.f, mats[i].height - 1.f}; + boxes.push_back(box); + bbox_count.push_back(1); + } + } - mmdeploy_pose_detection_t* detection{}; - auto status = mmdeploy_pose_detector_apply_bbox(detector_, mats.data(), (int)mats.size(), - boxes.data(), bbox_count.data(), &detection); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply pose_detector, code: " + std::to_string(status)); - } + mmdeploy_pose_detection_t* detection{}; + auto status = mmdeploy_pose_detector_apply_bbox(detector_, mats.data(), (int)mats.size(), boxes.data(), bbox_count.data(), &detection); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply pose_detector, code: " + std::to_string(status)); + } - auto output = py::list{}; - auto result = detection; - for (int i = 0; i < mats.size(); i++) { - int n_point = result->length; - auto pred = py::array_t({bbox_count[i], n_point, 3}); - auto dst = pred.mutable_data(); - for (int j = 0; j < bbox_count[i]; j++) { - for (int k = 0; k < n_point; k++) { - dst[0] = result->point[k].x; - dst[1] = result->point[k].y; - dst[2] = result->score[k]; - dst += 3; - } - result++; - } - output.append(std::move(pred)); - } + auto output = py::list{}; + auto result = 
detection; + for (int i = 0; i < mats.size(); i++) + { + int n_point = result->length; + auto pred = py::array_t({bbox_count[i], n_point, 3}); + auto dst = pred.mutable_data(); + for (int j = 0; j < bbox_count[i]; j++) + { + for (int k = 0; k < n_point; k++) + { + dst[0] = result->point[k].x; + dst[1] = result->point[k].y; + dst[2] = result->score[k]; + dst += 3; + } + result++; + } + output.append(std::move(pred)); + } - int total = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); - mmdeploy_pose_detector_release_result(detection, total); - return output; - } - ~PyPoseDetector() { - mmdeploy_pose_detector_destroy(detector_); - detector_ = {}; - } + int total = std::accumulate(bbox_count.begin(), bbox_count.end(), 0); + mmdeploy_pose_detector_release_result(detection, total); + return output; + } + ~PyPoseDetector() + { + mmdeploy_pose_detector_destroy(detector_); + detector_ = {}; + } - private: - mmdeploy_pose_detector_t detector_{}; -}; + private: + mmdeploy_pose_detector_t detector_{}; + }; -static PythonBindingRegisterer register_pose_detector{[](py::module& m) { - py::class_(m, "PoseDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyPoseDetector* self, const PyImage& img) -> py::array { - return self->Apply({img}, {})[0]; - }) - .def( - "__call__", - [](PyPoseDetector* self, const PyImage& img, const Rect& box) -> py::array { - std::vector> bboxes; - bboxes.push_back({box}); - return self->Apply({img}, bboxes)[0]; - }, - py::arg("img"), py::arg("box")) - .def( - "__call__", - [](PyPoseDetector* self, const PyImage& img, - const std::vector& bboxes) -> py::array { - std::vector> _bboxes; - _bboxes.push_back(bboxes); - return self->Apply({img}, _bboxes)[0]; - }, - py::arg("img"), py::arg("bboxes")) - .def("batch", &PyPoseDetector::Apply, py::arg("imgs"), - py::arg("bboxes") = std::vector>()); -}}; + static PythonBindingRegisterer register_pose_detector{[](py::module& m) + { + py::class_(m, "PoseDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyPoseDetector* self, const PyImage& img) -> py::array + { + return self->Apply({img}, {})[0]; + }) + .def( + "__call__", + [](PyPoseDetector* self, const PyImage& img, const Rect& box) -> py::array + { + std::vector> bboxes; + bboxes.push_back({box}); + return self->Apply({img}, bboxes)[0]; + }, + py::arg("img"), + py::arg("box")) + .def( + "__call__", + [](PyPoseDetector* self, const PyImage& img, const std::vector& bboxes) -> py::array + { + std::vector> _bboxes; + _bboxes.push_back(bboxes); + return self->Apply({img}, _bboxes)[0]; + }, + py::arg("img"), + py::arg("bboxes")) + .def("batch", &PyPoseDetector::Apply, py::arg("imgs"), py::arg("bboxes") = std::vector>()); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/pose_tracker.cpp b/csrc/mmdeploy/apis/python/pose_tracker.cpp index 035ce3cdd1..c14f2450e8 100644 --- a/csrc/mmdeploy/apis/python/pose_tracker.cpp +++ b/csrc/mmdeploy/apis/python/pose_tracker.cpp @@ -5,146 +5,200 @@ #include "common.h" #include "mmdeploy/common.hpp" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -namespace { + namespace + { -std::vector 
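
The three __call__ overloads registered above cover full-image, single-box and multi-box pose estimation; a sketch (hypothetical model path):

    import numpy as np
    from mmdeploy_runtime import PoseDetector

    pose = PoseDetector(model_path="mmpose_model_dir", device_name="cpu")
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    kpts = pose(img)                              # whole image as one bbox: (1, K, 3) = [x, y, score]
    kpts = pose(img, [50.0, 40.0, 200.0, 300.0])  # one box: [left, top, right, bottom]
    kpts = pose(img, bboxes=[[50, 40, 200, 300], [10, 10, 90, 90]])
    batch = pose.batch([img, img])                # bboxes defaults to whole images
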
Apply(mmdeploy::PoseTracker* self, - const std::vector& _states, - const std::vector& _frames, std::vector detect) { - std::vector tmp; - for (const auto& s : _states) { - tmp.push_back(static_cast(*s)); - } - mmdeploy::Span states(reinterpret_cast(tmp.data()), tmp.size()); - std::vector frames; - for (const auto& f : _frames) { - frames.emplace_back(GetMat(f)); - } - if (detect.empty()) { - detect.resize(frames.size(), -1); - } - assert(states.size() == frames.size()); - assert(states.size() == detect.size()); - auto results = self->Apply(states, frames, detect); - std::vector batch_ret; - batch_ret.reserve(frames.size()); - for (const auto& rs : results) { - py::array_t keypoints( - {static_cast(rs.size()), rs.size() > 0 ? rs[0].keypoint_count : 0, 3}); - py::array_t bboxes({static_cast(rs.size()), 4}); - py::array_t track_ids(static_cast(rs.size())); - auto kpts_ptr = keypoints.mutable_data(); - auto bbox_ptr = bboxes.mutable_data(); - auto track_id_ptr = track_ids.mutable_data(); - for (const auto& r : rs) { - for (int i = 0; i < r.keypoint_count; ++i) { - kpts_ptr[0] = r.keypoints[i].x; - kpts_ptr[1] = r.keypoints[i].y; - kpts_ptr[2] = r.scores[i]; - kpts_ptr += 3; - } - { - auto tmp_bbox = (std::array&)r.bbox; - bbox_ptr[0] = tmp_bbox[0]; - bbox_ptr[1] = tmp_bbox[1]; - bbox_ptr[2] = tmp_bbox[2]; - bbox_ptr[3] = tmp_bbox[3]; - bbox_ptr += 4; - } - *track_id_ptr++ = r.target_id; - } - batch_ret.push_back( - py::make_tuple(std::move(keypoints), std::move(bboxes), std::move(track_ids))); - } - return batch_ret; -} + std::vector Apply(mmdeploy::PoseTracker* self, + const std::vector& _states, + const std::vector& _frames, + std::vector detect) + { + std::vector tmp; + for (const auto& s : _states) + { + tmp.push_back(static_cast(*s)); + } + mmdeploy::Span states(reinterpret_cast(tmp.data()), tmp.size()); + std::vector frames; + for (const auto& f : _frames) + { + frames.emplace_back(GetMat(f)); + } + if (detect.empty()) + { + detect.resize(frames.size(), -1); + } + assert(states.size() == frames.size()); + assert(states.size() == detect.size()); + auto results = self->Apply(states, frames, detect); + std::vector batch_ret; + batch_ret.reserve(frames.size()); + for (const auto& rs : results) + { + py::array_t keypoints( + {static_cast(rs.size()), rs.size() > 0 ? 
rs[0].keypoint_count : 0, 3}); + py::array_t bboxes({static_cast(rs.size()), 4}); + py::array_t track_ids(static_cast(rs.size())); + auto kpts_ptr = keypoints.mutable_data(); + auto bbox_ptr = bboxes.mutable_data(); + auto track_id_ptr = track_ids.mutable_data(); + for (const auto& r : rs) + { + for (int i = 0; i < r.keypoint_count; ++i) + { + kpts_ptr[0] = r.keypoints[i].x; + kpts_ptr[1] = r.keypoints[i].y; + kpts_ptr[2] = r.scores[i]; + kpts_ptr += 3; + } + { + auto tmp_bbox = (std::array&)r.bbox; + bbox_ptr[0] = tmp_bbox[0]; + bbox_ptr[1] = tmp_bbox[1]; + bbox_ptr[2] = tmp_bbox[2]; + bbox_ptr[3] = tmp_bbox[3]; + bbox_ptr += 4; + } + *track_id_ptr++ = r.target_id; + } + batch_ret.push_back( + py::make_tuple(std::move(keypoints), std::move(bboxes), std::move(track_ids))); + } + return batch_ret; + } -template -void Copy(const py::handle& h, T (&a)[N]) { - auto array = h.cast>(); - assert(array.size() == N); - auto data = array.data(); - for (int i = 0; i < N; ++i) { - a[i] = data[i]; - } -} + template + void Copy(const py::handle& h, T (&a)[N]) + { + auto array = h.cast>(); + assert(array.size() == N); + auto data = array.data(); + for (int i = 0; i < N; ++i) + { + a[i] = data[i]; + } + } -void Parse(const py::dict& dict, PoseTracker::Params& params, py::array_t& sigmas) { - for (const auto& [_name, value] : dict) { - auto name = _name.cast(); - if (name == "det_interval") { - params->det_interval = value.cast(); - } else if (name == "det_label") { - params->det_label = value.cast(); - } else if (name == "det_thr") { - params->det_thr = value.cast(); - } else if (name == "det_min_bbox_size") { - params->det_min_bbox_size = value.cast(); - } else if (name == "det_nms_thr") { - params->det_nms_thr = value.cast(); - } else if (name == "pose_max_num_bboxes") { - params->pose_max_num_bboxes = value.cast(); - } else if (name == "pose_min_keypoints") { - params->pose_min_keypoints = value.cast(); - } else if (name == "pose_min_bbox_size") { - params->pose_min_bbox_size = value.cast(); - } else if (name == "pose_nms_thr") { - params->pose_nms_thr = value.cast(); - } else if (name == "track_kpt_thr") { - params->pose_kpt_thr = value.cast(); - } else if (name == "track_iou_thr") { - params->track_iou_thr = value.cast(); - } else if (name == "pose_bbox_scale") { - params->pose_bbox_scale = value.cast(); - } else if (name == "track_max_missing") { - params->track_max_missing = value.cast(); - } else if (name == "track_history_size") { - params->track_history_size = value.cast(); - } else if (name == "keypoint_sigmas") { - sigmas = value.cast>(); - params->keypoint_sigmas = const_cast(sigmas.data()); - params->keypoint_sigmas_size = sigmas.size(); - } else if (name == "std_weight_position") { - params->std_weight_position = value.cast(); - } else if (name == "std_weight_velocity") { - params->std_weight_velocity = value.cast(); - } else if (name == "smooth_params") { - Copy(value, params->smooth_params); - } else { - MMDEPLOY_ERROR("unused argument: {}", name); - } - } -} + void Parse(const py::dict& dict, PoseTracker::Params& params, py::array_t& sigmas) + { + for (const auto& [_name, value] : dict) + { + auto name = _name.cast(); + if (name == "det_interval") + { + params->det_interval = value.cast(); + } + else if (name == "det_label") + { + params->det_label = value.cast(); + } + else if (name == "det_thr") + { + params->det_thr = value.cast(); + } + else if (name == "det_min_bbox_size") + { + params->det_min_bbox_size = value.cast(); + } + else if (name == "det_nms_thr") + { + params->det_nms_thr 
= value.cast(); + } + else if (name == "pose_max_num_bboxes") + { + params->pose_max_num_bboxes = value.cast(); + } + else if (name == "pose_min_keypoints") + { + params->pose_min_keypoints = value.cast(); + } + else if (name == "pose_min_bbox_size") + { + params->pose_min_bbox_size = value.cast(); + } + else if (name == "pose_nms_thr") + { + params->pose_nms_thr = value.cast(); + } + else if (name == "track_kpt_thr") + { + params->pose_kpt_thr = value.cast(); + } + else if (name == "track_iou_thr") + { + params->track_iou_thr = value.cast(); + } + else if (name == "pose_bbox_scale") + { + params->pose_bbox_scale = value.cast(); + } + else if (name == "track_max_missing") + { + params->track_max_missing = value.cast(); + } + else if (name == "track_history_size") + { + params->track_history_size = value.cast(); + } + else if (name == "keypoint_sigmas") + { + sigmas = value.cast>(); + params->keypoint_sigmas = const_cast(sigmas.data()); + params->keypoint_sigmas_size = sigmas.size(); + } + else if (name == "std_weight_position") + { + params->std_weight_position = value.cast(); + } + else if (name == "std_weight_velocity") + { + params->std_weight_velocity = value.cast(); + } + else if (name == "smooth_params") + { + Copy(value, params->smooth_params); + } + else + { + MMDEPLOY_ERROR("unused argument: {}", name); + } + } + } -} // namespace + } // namespace -static PythonBindingRegisterer register_pose_tracker{[](py::module& m) { - py::class_(m, "PoseTracker.State"); - py::class_(m, "PoseTracker") - .def(py::init([](const char* det_model_path, const char* pose_model_path, - const char* device_name, int device_id) { - return mmdeploy::PoseTracker( - mmdeploy::Model(det_model_path), mmdeploy::Model(pose_model_path), - mmdeploy::Context(mmdeploy::Device(device_name, device_id))); - }), - py::arg("det_model"), py::arg("pose_model"), py::arg("device_name"), - py::arg("device_id") = 0) - .def( - "__call__", - [](mmdeploy::PoseTracker* self, mmdeploy::PoseTracker::State* state, const PyImage& img, - int detect) { return Apply(self, {state}, {img}, {detect})[0]; }, - py::arg("state"), py::arg("frame"), py::arg("detect") = -1) - .def("batch", &Apply, py::arg("states"), py::arg("frames"), - py::arg("detects") = std::vector{}) - .def("create_state", [](mmdeploy::PoseTracker* self, const py::kwargs& kwargs) { + static PythonBindingRegisterer register_pose_tracker{[](py::module& m) + { + py::class_(m, "PoseTracker.State"); + py::class_(m, "PoseTracker") + .def(py::init([](const char* det_model_path, const char* pose_model_path, const char* device_name, int device_id) + { return mmdeploy::PoseTracker( + mmdeploy::Model(det_model_path), + mmdeploy::Model(pose_model_path), + mmdeploy::Context(mmdeploy::Device(device_name, device_id))); }), + py::arg("det_model"), + py::arg("pose_model"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def( + "__call__", + [](mmdeploy::PoseTracker* self, mmdeploy::PoseTracker::State* state, const PyImage& img, int detect) + { return Apply(self, {state}, {img}, {detect})[0]; }, + py::arg("state"), + py::arg("frame"), + py::arg("detect") = -1) + .def("batch", &Apply, py::arg("states"), py::arg("frames"), py::arg("detects") = std::vector{}) + .def("create_state", [](mmdeploy::PoseTracker* self, const py::kwargs& kwargs) + { PoseTracker::Params params; py::array_t sigmas; if (kwargs) { Parse(kwargs, params, sigmas); } - return self->CreateState(params); - }); -}}; + return self->CreateState(params); }); + }}; } // namespace mmdeploy::python diff --git 
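
create_state forwards keyword arguments through Parse above (unrecognized keys are only logged as "unused argument"); a sketch with hypothetical model paths and placeholder sigma values:

    import numpy as np
    from mmdeploy_runtime import PoseTracker

    tracker = PoseTracker("det_model_dir", "pose_model_dir", device_name="cpu")
    state = tracker.create_state(det_interval=1, det_thr=0.5,
                                 keypoint_sigmas=np.full(17, 0.05, dtype=np.float32))
    frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stands in for a video frame
    keypoints, bboxes, track_ids = tracker(state, frame, detect=-1)
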
a/csrc/mmdeploy/apis/python/restorer.cpp b/csrc/mmdeploy/apis/python/restorer.cpp index 771af2a6c4..ddd4c0a8ff 100644 --- a/csrc/mmdeploy/apis/python/restorer.cpp +++ b/csrc/mmdeploy/apis/python/restorer.cpp @@ -4,63 +4,77 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyRestorer { - public: - PyRestorer(const char* model_path, const char* device_name, int device_id) { - auto status = mmdeploy_restorer_create_by_path(model_path, device_name, device_id, &restorer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create restorer"); - } - } - ~PyRestorer() { - mmdeploy_restorer_destroy(restorer_); - restorer_ = {}; - } + class PyRestorer + { + public: + PyRestorer(const char* model_path, const char* device_name, int device_id) + { + auto status = mmdeploy_restorer_create_by_path(model_path, device_name, device_id, &restorer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create restorer"); + } + } + ~PyRestorer() + { + mmdeploy_restorer_destroy(restorer_); + restorer_ = {}; + } - std::vector Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_mat_t* results{}; - auto status = mmdeploy_restorer_apply(restorer_, mats.data(), (int)mats.size(), &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply restorer, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(results, [n = mats.size()](auto p) { mmdeploy_restorer_release_result(p, n); }); + std::vector Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_mat_t* results{}; + auto status = mmdeploy_restorer_apply(restorer_, mats.data(), (int)mats.size(), &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply restorer, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(results, [n = mats.size()](auto p) + { mmdeploy_restorer_release_result(p, n); }); - std::vector rets(mats.size()); - for (int i = 0; i < mats.size(); ++i) { - rets[i] = { - {results[i].height, results[i].width, results[i].channel}, // shape - results[i].data, // data - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - return rets; - } + std::vector rets(mats.size()); + for (int i = 0; i < mats.size(); ++i) + { + rets[i] = { + {results[i].height, results[i].width, results[i].channel}, // shape + results[i].data, // data + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + return rets; + } - private: - mmdeploy_restorer_t restorer_{}; -}; + private: + mmdeploy_restorer_t restorer_{}; + }; -static PythonBindingRegisterer register_restorer{[](py::module& m) { - py::class_(m, "Restorer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyRestorer* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyRestorer::Apply); -}}; + static PythonBindingRegisterer register_restorer{[](py::module& m) + { + py::class_(m, "Restorer") + 
.def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyRestorer* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyRestorer::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/rotated_detector.cpp b/csrc/mmdeploy/apis/python/rotated_detector.cpp index bc760b04e4..148b31fa6e 100644 --- a/csrc/mmdeploy/apis/python/rotated_detector.cpp +++ b/csrc/mmdeploy/apis/python/rotated_detector.cpp @@ -4,74 +4,87 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyRotatedDetector { - public: - PyRotatedDetector(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_rotated_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create rotated detector"); - } - } - py::list Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } + class PyRotatedDetector + { + public: + PyRotatedDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_rotated_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create rotated detector"); + } + } + py::list Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } - mmdeploy_rotated_detection_t* rbboxes{}; - int* res_count{}; - auto status = mmdeploy_rotated_detector_apply(detector_, mats.data(), (int)mats.size(), - &rbboxes, &res_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply rotated detector, code: " + std::to_string(status)); - } - auto output = py::list{}; - auto result = rbboxes; - auto counts = res_count; - for (int i = 0; i < mats.size(); i++) { - auto _dets = py::array_t({*counts, 6}); - auto _labels = py::array_t({*counts}); - auto dets = _dets.mutable_data(); - auto labels = _labels.mutable_data(); - for (int j = 0; j < *counts; j++) { - for (int k = 0; k < 5; k++) { - *dets++ = result->rbbox[k]; + mmdeploy_rotated_detection_t* rbboxes{}; + int* res_count{}; + auto status = mmdeploy_rotated_detector_apply(detector_, mats.data(), (int)mats.size(), &rbboxes, &res_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply rotated detector, code: " + std::to_string(status)); + } + auto output = py::list{}; + auto result = rbboxes; + auto counts = res_count; + for (int i = 0; i < mats.size(); i++) + { + auto _dets = py::array_t({*counts, 6}); + auto _labels = py::array_t({*counts}); + auto dets = _dets.mutable_data(); + auto labels = _labels.mutable_data(); + for (int j = 0; j < *counts; j++) + { + for (int k = 0; k < 5; k++) + { + *dets++ = result->rbbox[k]; + } + *dets++ = result->score; + *labels++ = result->label_id; + result++; + } + counts++; + output.append(py::make_tuple(std::move(_dets), std::move(_labels))); + } + mmdeploy_rotated_detector_release_result(rbboxes, res_count); + return output; + } + ~PyRotatedDetector() + { + mmdeploy_rotated_detector_destroy(detector_); 
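
A sketch for the Restorer binding above (hypothetical model path); the returned arrays alias the C result buffer, which the py::capsule holder keeps alive:

    import numpy as np
    from mmdeploy_runtime import Restorer

    restorer = Restorer(model_path="mmedit_model_dir", device_name="cpu")
    img = np.zeros((128, 128, 3), dtype=np.uint8)
    hires = restorer(img)            # (H, W, C) uint8 array
    results = restorer.batch([img])  # list of arrays
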
+ detector_ = {}; } - *dets++ = result->score; - *labels++ = result->label_id; - result++; - } - counts++; - output.append(py::make_tuple(std::move(_dets), std::move(_labels))); - } - mmdeploy_rotated_detector_release_result(rbboxes, res_count); - return output; - } - ~PyRotatedDetector() { - mmdeploy_rotated_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_rotated_detector_t detector_{}; -}; + private: + mmdeploy_rotated_detector_t detector_{}; + }; -static PythonBindingRegisterer register_rotated_detector{[](py::module& m) { - py::class_(m, "RotatedDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyRotatedDetector* self, const PyImage& img) -> py::tuple { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyRotatedDetector::Apply); -}}; + static PythonBindingRegisterer register_rotated_detector{[](py::module& m) + { + py::class_(m, "RotatedDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyRotatedDetector* self, const PyImage& img) -> py::tuple + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyRotatedDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/segmentor.cpp b/csrc/mmdeploy/apis/python/segmentor.cpp index 940972ab61..9e1db508c7 100644 --- a/csrc/mmdeploy/apis/python/segmentor.cpp +++ b/csrc/mmdeploy/apis/python/segmentor.cpp @@ -4,74 +4,91 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PySegmentor { - public: - PySegmentor(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_segmentor_create_by_path(model_path, device_name, device_id, &segmentor_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create segmentor"); - } - } - ~PySegmentor() { - mmdeploy_segmentor_destroy(segmentor_); - segmentor_ = {}; - } + class PySegmentor + { + public: + PySegmentor(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_segmentor_create_by_path(model_path, device_name, device_id, &segmentor_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create segmentor"); + } + } + ~PySegmentor() + { + mmdeploy_segmentor_destroy(segmentor_); + segmentor_ = {}; + } - std::vector Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_segmentation_t* segm{}; - auto status = mmdeploy_segmentor_apply(segmentor_, mats.data(), (int)mats.size(), &segm); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply segmentor, code: " + std::to_string(status)); - } - using Sptr = std::shared_ptr; - Sptr holder(segm, [n = mats.size()](auto p) { mmdeploy_segmentor_release_result(p, n); }); + std::vector Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_segmentation_t* segm{}; + auto status = mmdeploy_segmentor_apply(segmentor_, mats.data(), 
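
A sketch for the RotatedDetector binding above (hypothetical model path); each row holds the five rotated-box parameters followed by the score:

    import numpy as np
    from mmdeploy_runtime import RotatedDetector

    rdet = RotatedDetector(model_path="mmrotate_model_dir", device_name="cpu")
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    dets, labels = rdet(img)  # dets: (N, 6), labels: (N,)
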
(int)mats.size(), &segm); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply segmentor, code: " + std::to_string(status)); + } + using Sptr = std::shared_ptr; + Sptr holder(segm, [n = mats.size()](auto p) + { mmdeploy_segmentor_release_result(p, n); }); - std::vector rets(mats.size()); - for (size_t i = 0; i < mats.size(); ++i) { - if (segm[i].mask != nullptr) { - rets[i] = { - {segm[i].height, segm[i].width}, // shape - segm[i].mask, // mask - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - if (segm[i].score != nullptr) { - rets[i] = { - {segm[i].classes, segm[i].height, segm[i].width}, // shape - segm[i].score, // score - py::capsule(new Sptr(holder), // handle - [](void* p) { delete reinterpret_cast(p); }) // - }; - } - } - return rets; - } + std::vector rets(mats.size()); + for (size_t i = 0; i < mats.size(); ++i) + { + if (segm[i].mask != nullptr) + { + rets[i] = { + {segm[i].height, segm[i].width}, // shape + segm[i].mask, // mask + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + if (segm[i].score != nullptr) + { + rets[i] = { + {segm[i].classes, segm[i].height, segm[i].width}, // shape + segm[i].score, // score + py::capsule(new Sptr(holder), // handle + [](void* p) + { delete reinterpret_cast(p); }) // + }; + } + } + return rets; + } - private: - mmdeploy_segmentor_t segmentor_{}; -}; + private: + mmdeploy_segmentor_t segmentor_{}; + }; -static PythonBindingRegisterer register_segmentor{[](py::module& m) { - py::class_(m, "Segmentor") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PySegmentor* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PySegmentor::Apply); -}}; + static PythonBindingRegisterer register_segmentor{[](py::module& m) + { + py::class_(m, "Segmentor") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PySegmentor* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PySegmentor::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/text_detector.cpp b/csrc/mmdeploy/apis/python/text_detector.cpp index 19762d08ec..1326588a1f 100644 --- a/csrc/mmdeploy/apis/python/text_detector.cpp +++ b/csrc/mmdeploy/apis/python/text_detector.cpp @@ -4,68 +4,81 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyTextDetector { - public: - PyTextDetector(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_text_detector_create_by_path(model_path, device_name, device_id, &detector_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create text_detector"); - } - } - std::vector> Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_text_detection_t* detection{}; - int* result_count{}; - auto status = mmdeploy_text_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, 
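
A sketch for the Segmentor binding above (hypothetical model path); the result is an integer (H, W) mask or, when the model emits per-class scores, a float (C, H, W) volume:

    import numpy as np
    from mmdeploy_runtime import Segmentor

    seg = Segmentor(model_path="mmseg_model_dir", device_name="cpu")
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    out = seg(img)               # (H, W) int mask or (C, H, W) float scores
    outs = seg.batch([img, img])
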
- &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_detector, code: " + std::to_string(status)); - } - auto output = std::vector>{}; - auto result = detection; - for (int i = 0; i < mats.size(); ++i) { - auto bboxes = py::array_t({result_count[i], 9}); - for (int j = 0; j < result_count[i]; ++j, ++result) { - auto data = bboxes.mutable_data(j); - for (const auto& p : result->bbox) { - *data++ = p.x; - *data++ = p.y; + class PyTextDetector + { + public: + PyTextDetector(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_text_detector_create_by_path(model_path, device_name, device_id, &detector_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create text_detector"); + } + } + std::vector> Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_text_detection_t* detection{}; + int* result_count{}; + auto status = mmdeploy_text_detector_apply(detector_, mats.data(), (int)mats.size(), &detection, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_detector, code: " + std::to_string(status)); + } + auto output = std::vector>{}; + auto result = detection; + for (int i = 0; i < mats.size(); ++i) + { + auto bboxes = py::array_t({result_count[i], 9}); + for (int j = 0; j < result_count[i]; ++j, ++result) + { + auto data = bboxes.mutable_data(j); + for (const auto& p : result->bbox) + { + *data++ = p.x; + *data++ = p.y; + } + *data++ = result->score; + } + output.push_back(std::move(bboxes)); + } + mmdeploy_text_detector_release_result(detection, result_count, (int)mats.size()); + return output; + } + ~PyTextDetector() + { + mmdeploy_text_detector_destroy(detector_); + detector_ = {}; } - *data++ = result->score; - } - output.push_back(std::move(bboxes)); - } - mmdeploy_text_detector_release_result(detection, result_count, (int)mats.size()); - return output; - } - ~PyTextDetector() { - mmdeploy_text_detector_destroy(detector_); - detector_ = {}; - } - private: - mmdeploy_text_detector_t detector_{}; -}; + private: + mmdeploy_text_detector_t detector_{}; + }; -static PythonBindingRegisterer register_text_detector{[](py::module& m) { - py::class_(m, "TextDetector") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", - [](PyTextDetector* self, const PyImage& img) -> py::array { - return self->Apply(std::vector{img})[0]; - }) - .def("batch", &PyTextDetector::Apply); -}}; + static PythonBindingRegisterer register_text_detector{[](py::module& m) + { + py::class_(m, "TextDetector") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", + [](PyTextDetector* self, const PyImage& img) -> py::array + { + return self->Apply(std::vector{img})[0]; + }) + .def("batch", &PyTextDetector::Apply); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/text_recognizer.cpp b/csrc/mmdeploy/apis/python/text_recognizer.cpp index 317f55103a..1b3bc92af8 100644 --- a/csrc/mmdeploy/apis/python/text_recognizer.cpp +++ 
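
A sketch for the TextDetector binding above (hypothetical model path); each row packs the four corner points followed by the score:

    import numpy as np
    from mmdeploy_runtime import TextDetector

    tdet = TextDetector(model_path="dbnet_model_dir", device_name="cpu")
    img = np.zeros((480, 640, 3), dtype=np.uint8)
    bboxes = tdet(img)  # (N, 9) = x1, y1, ..., x4, y4, score
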
b/csrc/mmdeploy/apis/python/text_recognizer.cpp @@ -4,79 +4,99 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyTextRecognizer { - public: - PyTextRecognizer(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_text_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create text_recognizer"); - } - } - std::vector>> Apply(const std::vector& imgs) { - std::vector mats; - mats.reserve(imgs.size()); - for (const auto& img : imgs) { - auto mat = GetMat(img); - mats.push_back(mat); - } - mmdeploy_text_recognition_t* results{}; - auto status = - mmdeploy_text_recognizer_apply(recognizer_, mats.data(), (int)mats.size(), &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); - } - auto output = std::vector>>{}; - for (int i = 0; i < mats.size(); ++i) { - std::vector score(results[i].score, results[i].score + results[i].length); - output.emplace_back(results[i].text, std::move(score)); - } - mmdeploy_text_recognizer_release_result(results, (int)mats.size()); - return output; - } - std::vector>> Apply(const PyImage& img, - const std::vector& bboxes) { - if (bboxes.size() * sizeof(float) % sizeof(mmdeploy_text_detection_t)) { - throw std::invalid_argument("bboxes is not a list of 'mmdeploy_text_detection_t'"); - } - auto mat = GetMat(img); - int bbox_count = bboxes.size() * sizeof(float) / sizeof(mmdeploy_text_detection_t); - mmdeploy_text_recognition_t* results{}; - auto status = mmdeploy_text_recognizer_apply_bbox( - recognizer_, &mat, 1, (mmdeploy_text_detection_t*)bboxes.data(), &bbox_count, &results); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); - } - auto output = std::vector>>{}; - for (int i = 0; i < bbox_count; ++i) { - std::vector score(results[i].score, results[i].score + results[i].length); - output.emplace_back(results[i].text, std::move(score)); - } - mmdeploy_text_recognizer_release_result(results, bbox_count); - return output; - } - ~PyTextRecognizer() { - mmdeploy_text_recognizer_destroy(recognizer_); - recognizer_ = {}; - } + class PyTextRecognizer + { + public: + PyTextRecognizer(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_text_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create text_recognizer"); + } + } + std::vector>> Apply(const std::vector& imgs) + { + std::vector mats; + mats.reserve(imgs.size()); + for (const auto& img : imgs) + { + auto mat = GetMat(img); + mats.push_back(mat); + } + mmdeploy_text_recognition_t* results{}; + auto status = + mmdeploy_text_recognizer_apply(recognizer_, mats.data(), (int)mats.size(), &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); + } + auto output = std::vector>>{}; + for (int i = 0; i < mats.size(); ++i) + { + std::vector score(results[i].score, results[i].score + results[i].length); + output.emplace_back(results[i].text, std::move(score)); + } + mmdeploy_text_recognizer_release_result(results, (int)mats.size()); + return output; + } + std::vector>> Apply(const PyImage& img, + const std::vector& bboxes) + { + if (bboxes.size() * 
sizeof(float) % sizeof(mmdeploy_text_detection_t)) + { + throw std::invalid_argument("bboxes is not a list of 'mmdeploy_text_detection_t'"); + } + auto mat = GetMat(img); + int bbox_count = bboxes.size() * sizeof(float) / sizeof(mmdeploy_text_detection_t); + mmdeploy_text_recognition_t* results{}; + auto status = mmdeploy_text_recognizer_apply_bbox( + recognizer_, + &mat, + 1, + (mmdeploy_text_detection_t*)bboxes.data(), + &bbox_count, + &results); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply text_recognizer, code: " + std::to_string(status)); + } + auto output = std::vector>>{}; + for (int i = 0; i < bbox_count; ++i) + { + std::vector score(results[i].score, results[i].score + results[i].length); + output.emplace_back(results[i].text, std::move(score)); + } + mmdeploy_text_recognizer_release_result(results, bbox_count); + return output; + } + ~PyTextRecognizer() + { + mmdeploy_text_recognizer_destroy(recognizer_); + recognizer_ = {}; + } - private: - mmdeploy_text_recognizer_t recognizer_{}; -}; + private: + mmdeploy_text_recognizer_t recognizer_{}; + }; -static PythonBindingRegisterer register_text_recognizer{[](py::module& m) { - py::class_(m, "TextRecognizer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return std::make_unique(model_path, device_name, device_id); - }), - py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0) - .def("__call__", [](PyTextRecognizer* self, - const PyImage& img) { return self->Apply(std::vector{img})[0]; }) - .def("__call__", [](PyTextRecognizer* self, const PyImage& img, - const std::vector& bboxes) { return self->Apply(img, bboxes); }) - .def("batch", py::overload_cast&>(&PyTextRecognizer::Apply)); -}}; + static PythonBindingRegisterer register_text_recognizer{[](py::module& m) + { + py::class_(m, "TextRecognizer") + .def(py::init([](const char* model_path, const char* device_name, int device_id) + { return std::make_unique(model_path, device_name, device_id); }), + py::arg("model_path"), + py::arg("device_name"), + py::arg("device_id") = 0) + .def("__call__", [](PyTextRecognizer* self, const PyImage& img) + { return self->Apply(std::vector{img})[0]; }) + .def("__call__", [](PyTextRecognizer* self, const PyImage& img, const std::vector& bboxes) + { return self->Apply(img, bboxes); }) + .def("batch", py::overload_cast&>(&PyTextRecognizer::Apply)); + }}; } // namespace mmdeploy::python diff --git a/csrc/mmdeploy/apis/python/video_recognizer.cpp b/csrc/mmdeploy/apis/python/video_recognizer.cpp index 7c70337e51..ac2e691be3 100644 --- a/csrc/mmdeploy/apis/python/video_recognizer.cpp +++ b/csrc/mmdeploy/apis/python/video_recognizer.cpp @@ -4,85 +4,102 @@ #include "common.h" -namespace mmdeploy::python { +namespace mmdeploy::python +{ -class PyVideoRecognizer { - public: - PyVideoRecognizer(const char* model_path, const char* device_name, int device_id) { - auto status = - mmdeploy_video_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to create video_recognizer"); - } - } - std::vector>> Apply( - const std::vector>& imgs, const std::vector>& info) { - if (info.size() != imgs.size()) { - throw std::invalid_argument("the length of info is not equal with imgs"); - } - for (int i = 0; i < info.size(); i++) { - if (imgs[i].size() != info[i].first * info[i].second) { - throw std::invalid_argument("invalid info"); - } - } - int total = 0; - for (int i = 0; i < imgs.size(); 
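
The bbox overload above expects a flat float list whose byte size is a whole number of mmdeploy_text_detection_t records (nine floats each), which is exactly what flattening TextDetector output yields; a sketch (hypothetical model paths):

    import numpy as np
    from mmdeploy_runtime import TextDetector, TextRecognizer

    trec = TextRecognizer(model_path="crnn_model_dir", device_name="cpu")
    img = np.zeros((64, 640, 3), dtype=np.uint8)
    text, scores = trec(img)              # whole image treated as one text line
    tdet = TextDetector(model_path="dbnet_model_dir", device_name="cpu")
    boxes = tdet(img).flatten().tolist()  # nine floats per detected box
    results = trec(img, boxes)            # one (text, per-character scores) pair per box
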
i++) { - total += imgs[i].size(); - } - std::vector clips; - std::vector clip_info; - clips.reserve(total); - clip_info.reserve(total); - for (int i = 0; i < imgs.size(); i++) { - for (const auto& img : imgs[i]) { - auto mat = GetMat(img); - clips.push_back(mat); - } - clip_info.push_back({info[i].first, info[i].second}); - } + class PyVideoRecognizer + { + public: + PyVideoRecognizer(const char* model_path, const char* device_name, int device_id) + { + auto status = + mmdeploy_video_recognizer_create_by_path(model_path, device_name, device_id, &recognizer_); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to create video_recognizer"); + } + } + std::vector>> Apply( + const std::vector>& imgs, + const std::vector>& info) + { + if (info.size() != imgs.size()) + { + throw std::invalid_argument("the length of info is not equal with imgs"); + } + for (int i = 0; i < info.size(); i++) + { + if (imgs[i].size() != info[i].first * info[i].second) + { + throw std::invalid_argument("invalid info"); + } + } + int total = 0; + for (int i = 0; i < imgs.size(); i++) + { + total += imgs[i].size(); + } + std::vector clips; + std::vector clip_info; + clips.reserve(total); + clip_info.reserve(total); + for (int i = 0; i < imgs.size(); i++) + { + for (const auto& img : imgs[i]) + { + auto mat = GetMat(img); + clips.push_back(mat); + } + clip_info.push_back({info[i].first, info[i].second}); + } - mmdeploy_video_recognition_t* results{}; - int* result_count{}; - auto status = mmdeploy_video_recognizer_apply(recognizer_, clips.data(), clip_info.data(), 1, - &results, &result_count); - if (status != MMDEPLOY_SUCCESS) { - throw std::runtime_error("failed to apply video_recognizer, code: " + std::to_string(status)); - } + mmdeploy_video_recognition_t* results{}; + int* result_count{}; + auto status = mmdeploy_video_recognizer_apply(recognizer_, clips.data(), clip_info.data(), 1, &results, &result_count); + if (status != MMDEPLOY_SUCCESS) + { + throw std::runtime_error("failed to apply video_recognizer, code: " + std::to_string(status)); + } - auto output = std::vector>>{}; - output.reserve(imgs.size()); - auto result_ptr = results; - for (int i = 0; i < imgs.size(); ++i) { - std::vector> label_score; - for (int j = 0; j < result_count[i]; ++j) { - label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); - } - output.push_back(std::move(label_score)); - result_ptr += result_count[i]; - } - mmdeploy_video_recognizer_release_result(results, result_count, (int)imgs.size()); - return output; - } + auto output = std::vector>>{}; + output.reserve(imgs.size()); + auto result_ptr = results; + for (int i = 0; i < imgs.size(); ++i) + { + std::vector> label_score; + for (int j = 0; j < result_count[i]; ++j) + { + label_score.emplace_back(result_ptr[j].label_id, result_ptr[j].score); + } + output.push_back(std::move(label_score)); + result_ptr += result_count[i]; + } + mmdeploy_video_recognizer_release_result(results, result_count, (int)imgs.size()); + return output; + } - ~PyVideoRecognizer() { - mmdeploy_video_recognizer_destroy(recognizer_); - recognizer_ = {}; - } + ~PyVideoRecognizer() + { + mmdeploy_video_recognizer_destroy(recognizer_); + recognizer_ = {}; + } - private: - mmdeploy_video_recognizer_t recognizer_{}; -}; + private: + mmdeploy_video_recognizer_t recognizer_{}; + }; -static PythonBindingRegisterer register_video_recognizer{[](py::module& m) { - py::class_(m, "VideoRecognizer") - .def(py::init([](const char* model_path, const char* device_name, int device_id) { - return 
-           }),
-           py::arg("model_path"), py::arg("device_name"), py::arg("device_id") = 0)
-      .def("__call__",
-           [](PyVideoRecognizer* self, const std::vector<PyImage>& imgs,
-              const std::pair<int, int>& info) { return self->Apply({imgs}, {info})[0]; })
-      .def("batch", &PyVideoRecognizer::Apply);
-}};
+    static PythonBindingRegisterer register_video_recognizer{[](py::module& m)
+        {
+            py::class_<PyVideoRecognizer>(m, "VideoRecognizer")
+                .def(py::init([](const char* model_path, const char* device_name, int device_id)
+                              { return std::make_unique<PyVideoRecognizer>(model_path, device_name, device_id); }),
+                     py::arg("model_path"),
+                     py::arg("device_name"),
+                     py::arg("device_id") = 0)
+                .def("__call__",
+                     [](PyVideoRecognizer* self, const std::vector<PyImage>& imgs, const std::pair<int, int>& info)
+                     { return self->Apply({imgs}, {info})[0]; })
+                .def("batch", &PyVideoRecognizer::Apply);
+        }};
 }  // namespace mmdeploy::python
diff --git a/csrc/mmdeploy/archive/CMakeLists.txt b/csrc/mmdeploy/archive/CMakeLists.txt
index 3f3d1f1104..68c34d3d05 100644
--- a/csrc/mmdeploy/archive/CMakeLists.txt
+++ b/csrc/mmdeploy/archive/CMakeLists.txt
@@ -6,8 +6,10 @@ add_library(${PROJECT_NAME} INTERFACE)
 target_link_libraries(${PROJECT_NAME} INTERFACE mmdeploy::core)
 add_library(mmdeploy::archive ALIAS mmdeploy_archive)
-install(DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/mmdeploy/archive
-        DESTINATION include/mmdeploy
-        FILES_MATCHING PATTERN "*.h")
+install(
+  DIRECTORY ${CMAKE_SOURCE_DIR}/csrc/mmdeploy/archive
+  DESTINATION include/mmdeploy
+  FILES_MATCHING
+  PATTERN "*.h")
 install(FILES ${CMAKE_SOURCE_DIR}/third_party/json/json.hpp
         DESTINATION include/mmdeploy/third_party/json)
diff --git a/csrc/mmdeploy/archive/json_archive.h b/csrc/mmdeploy/archive/json_archive.h
index 2803ee22b2..cf03005856 100644
--- a/csrc/mmdeploy/archive/json_archive.h
+++ b/csrc/mmdeploy/archive/json_archive.h
@@ -7,207 +7,247 @@
 #include "mmdeploy/core/archive.h"
 #include "mmdeploy/core/value.h"
-namespace mmdeploy {
-
-namespace detail {
-
-template <typename T>
-nlohmann::json to_json_impl(T&& val);
-
-inline nlohmann::json value_to_json(const Value& value) {
-  switch (value.type()) {
-    case ValueType::kNull:
-      return {};
-    case ValueType::kBool:
-      return value.get<bool>();
-    case ValueType::kInt:
-      return value.get<int64_t>();
-    case ValueType::kUInt:
-      return value.get<uint64_t>();
-    case ValueType::kFloat:
-      return value.get<double>();
-    case ValueType::kString:
-      return value.get<std::string>();
-    case ValueType::kArray: {
-      nlohmann::json json = nlohmann::json::value_t::array;
-      for (const auto& x : value) {
-        json.push_back(value_to_json(x));
-      }
-      return json;
+namespace mmdeploy
+{
+
+    namespace detail
+    {
+
+        template <typename T>
+        nlohmann::json to_json_impl(T&& val);
+
+        inline nlohmann::json value_to_json(const Value& value)
+        {
+            switch (value.type())
+            {
+                case ValueType::kNull:
+                    return {};
+                case ValueType::kBool:
+                    return value.get<bool>();
+                case ValueType::kInt:
+                    return value.get<int64_t>();
+                case ValueType::kUInt:
+                    return value.get<uint64_t>();
+                case ValueType::kFloat:
+                    return value.get<double>();
+                case ValueType::kString:
+                    return value.get<std::string>();
+                case ValueType::kArray:
+                {
+                    nlohmann::json json = nlohmann::json::value_t::array;
+                    for (const auto& x : value)
+                    {
+                        json.push_back(value_to_json(x));
+                    }
+                    return json;
+                }
+                case ValueType::kObject:
+                {
+                    nlohmann::json json = nlohmann::json::value_t::object;
+                    for (auto it = value.begin(); it != value.end(); ++it)
+                    {
+                        auto key = it.key();
+                        json[key] = value_to_json(*it);
+                    }
+                    return json;
+                }
+                case ValueType::kAny:
+                    return "";
+                default:
+                    return "";
+            }
+        }
+
+    }  // namespace detail
+
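+    // Usage sketch (illustrative, not from the original sources): the entry
+    // points below convert between mmdeploy::Value and nlohmann::json in both
+    // directions, e.g.
+    //
+    //   mmdeploy::Value v{{"model", "resnet50"}, {"batch_size", 32}};
+    //   nlohmann::json j = mmdeploy::to_json(v);            // Value -> json
+    //   auto v2 = mmdeploy::from_json<mmdeploy::Value>(j);  // json -> Value
+    //
+    // The brace-init of Value above assumes its nlohmann-style constructors.
+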
template>, int> = 0> + nlohmann::json to_json(T&& val) + { + return detail::to_json_impl(std::forward(val)); } - case ValueType::kObject: { - nlohmann::json json = nlohmann::json::value_t::object; - for (auto it = value.begin(); it != value.end(); ++it) { - auto key = it.key(); - json[key] = value_to_json(*it); - } - return json; + + inline nlohmann::json to_json(const Value& value) + { + return detail::value_to_json(value); + } + + // save to JSON + class JsonOutputArchive : public OutputArchive + { + public: + explicit JsonOutputArchive(nlohmann::json& data) + : data_(data) + { + } + + void init(...) {} + + template + void named_value(const std::string& name, T&& val) + { + data_[name] = to_json(std::forward(val)); + } + + template + void item(T&& val) + { + data_.push_back(to_json(std::forward(val))); + } + + template, std::enable_if_t, std::is_same, std::is_same, std::is_same>, int> = 0> + void native(T&& val) + { + data_ = std::forward(val); + } + + private: + nlohmann::json& data_; + }; + + namespace detail + { + + template + inline nlohmann::json to_json_impl(T&& val) + { + nlohmann::json json; + JsonOutputArchive archive(json); + archive(std::forward(val)); + return json; + } + + } // namespace detail + + namespace detail + { + + inline Value json_to_value(const nlohmann::json& json) + { + using value_t = nlohmann::json::value_t; + switch (json.type()) + { + case value_t::null: + return {}; + case value_t::boolean: + return json.get(); + case value_t::number_integer: + return json.get(); + case value_t::number_unsigned: + return json.get(); + case value_t::number_float: + return json.get(); + case value_t::string: + return json.get(); + case value_t::array: + { + Value value = ValueType::kArray; + for (const auto& x : json) + { + value.push_back(json_to_value(x)); + } + return value; + } + case value_t::object: + { + Value value = ValueType::kObject; + for (const auto& proxy : json.items()) + { + value[proxy.key()] = json_to_value(proxy.value()); + } + return value; + } + default: + MMDEPLOY_ERROR("unsupported json type: {}", json.type_name()); + return {}; + } + } + + template + void from_json_impl(const nlohmann::json& json, T&& val); + + } // namespace detail + + template>, int> = 0> + void from_json(const nlohmann::json& json, T&& val) + { + detail::from_json_impl(json, std::forward(val)); } - case ValueType::kAny: - return ""; - default: - return ""; - } -} - -} // namespace detail - -template >, int> = 0> -nlohmann::json to_json(T&& val) { - return detail::to_json_impl(std::forward(val)); -} - -inline nlohmann::json to_json(const Value& value) { return detail::value_to_json(value); } - -// save to JSON -class JsonOutputArchive : public OutputArchive { - public: - explicit JsonOutputArchive(nlohmann::json& data) : data_(data) {} - - void init(...) 
{} - - template - void named_value(const std::string& name, T&& val) { - data_[name] = to_json(std::forward(val)); - } - - template - void item(T&& val) { - data_.push_back(to_json(std::forward(val))); - } - - template , - std::enable_if_t< - std::disjunction_v, std::is_same, - std::is_same, std::is_same>, - int> = 0> - void native(T&& val) { - data_ = std::forward(val); - } - - private: - nlohmann::json& data_; -}; - -namespace detail { - -template -inline nlohmann::json to_json_impl(T&& val) { - nlohmann::json json; - JsonOutputArchive archive(json); - archive(std::forward(val)); - return json; -} - -} // namespace detail - -namespace detail { - -inline Value json_to_value(const nlohmann::json& json) { - using value_t = nlohmann::json::value_t; - switch (json.type()) { - case value_t::null: - return {}; - case value_t::boolean: - return json.get(); - case value_t::number_integer: - return json.get(); - case value_t::number_unsigned: - return json.get(); - case value_t::number_float: - return json.get(); - case value_t::string: - return json.get(); - case value_t::array: { - Value value = ValueType::kArray; - for (const auto& x : json) { - value.push_back(json_to_value(x)); - } - return value; + + inline void from_json(const nlohmann::json& json, Value& val) + { + val = detail::json_to_value(json); } - case value_t::object: { - Value value = ValueType::kObject; - for (const auto& proxy : json.items()) { - value[proxy.key()] = json_to_value(proxy.value()); - } - return value; + + template + T from_json(const nlohmann::json& json); + + // load from JSON + class JsonInputArchive : public InputArchive + { + public: + explicit JsonInputArchive(const nlohmann::json& data) + : data_(data) + { + } + + template + void init(SizeType& size) + { + size = static_cast(data_.size()); + iter_ = data_.begin(); + } + + template + void named_value(std::string& name, T& val) + { + name = iter_.key(); + from_json(*iter_++, std::forward(val)); + } + + template + void named_value(const std::string& name, T&& val) + { + from_json(data_[name], std::forward(val)); + } + + template + void item(T&& val) + { + from_json(*iter_++, std::forward(val)); + } + + template + void native(T&& val) + { + data_.get_to(val); + } + + private: + const nlohmann::json& data_; + nlohmann::json::const_iterator iter_; + }; + + namespace detail + { + + template + inline void from_json_impl(const nlohmann::json& json, T&& val) + { + JsonInputArchive archive(json); + archive(std::forward(val)); + } + + } // namespace detail + + template + inline T from_json(const nlohmann::json& json) + { + T val{}; + from_json(json, val); + return val; } - default: - MMDEPLOY_ERROR("unsupported json type: {}", json.type_name()); - return {}; - } -} - -template -void from_json_impl(const nlohmann::json& json, T&& val); - -} // namespace detail - -template >, int> = 0> -void from_json(const nlohmann::json& json, T&& val) { - detail::from_json_impl(json, std::forward(val)); -} - -inline void from_json(const nlohmann::json& json, Value& val) { val = detail::json_to_value(json); } - -template -T from_json(const nlohmann::json& json); - -// load from JSON -class JsonInputArchive : public InputArchive { - public: - explicit JsonInputArchive(const nlohmann::json& data) : data_(data) {} - - template - void init(SizeType& size) { - size = static_cast(data_.size()); - iter_ = data_.begin(); - } - - template - void named_value(std::string& name, T& val) { - name = iter_.key(); - from_json(*iter_++, std::forward(val)); - } - - template - void named_value(const 
std::string& name, T&& val) { - from_json(data_[name], std::forward(val)); - } - - template - void item(T&& val) { - from_json(*iter_++, std::forward(val)); - } - - template - void native(T&& val) { - data_.get_to(val); - } - - private: - const nlohmann::json& data_; - nlohmann::json::const_iterator iter_; -}; - -namespace detail { - -template -inline void from_json_impl(const nlohmann::json& json, T&& val) { - JsonInputArchive archive(json); - archive(std::forward(val)); -} - -} // namespace detail - -template -inline T from_json(const nlohmann::json& json) { - T val{}; - from_json(json, val); - return val; -} - -void from_json(const nlohmann::json& json, Value& val); + + void from_json(const nlohmann::json& json, Value& val); } // namespace mmdeploy diff --git a/csrc/mmdeploy/archive/value_archive.h b/csrc/mmdeploy/archive/value_archive.h index 2f559c1a10..f3245f0dfc 100644 --- a/csrc/mmdeploy/archive/value_archive.h +++ b/csrc/mmdeploy/archive/value_archive.h @@ -6,131 +6,169 @@ #include "mmdeploy/core/archive.h" #include "mmdeploy/core/value.h" -namespace mmdeploy { - -template -Value to_value(T&& val); - -// save to Value -class ValueOutputArchive : public OutputArchive { - public: - explicit ValueOutputArchive(Value& data) : data_(data) {} - - template - void init(array_tag) { - data_ = ValueType::kArray; - } - - template - void init(object_tag) { - data_ = ValueType::kObject; - } - - template - void named_value(const std::string& name, T&& val) { - data_[name] = to_value(std::forward(val)); - } - - template - void item(T&& val) { - data_.push_back(to_value(std::forward(val))); - } - - template , int> = 0> - void native(T&& val) { - data_ = std::forward(val); - }; - - private: - Value& data_; -}; - -template -inline Value to_value(T&& val) { - Value value; - ValueOutputArchive archive(value); - archive(std::forward(val)); - return value; -} - -// fast path -inline Value to_value(const Value& v) { return v; } -inline Value to_value(Value&& v) { return std::move(v); } - -template -void from_value(const Value& value, T&& x); - -template -T from_value(const Value& value); - -// load from Value -class ValueInputArchive : public InputArchive { - public: - explicit ValueInputArchive(const Value& data) : data_(data) {} - - template - void init(SizeType& size) { - size = static_cast(data_.size()); - iter_ = data_.begin(); - } - - template - void named_value(std::string& name, T& val) { - name = iter_.key(); - from_value(*iter_, std::forward(val)); - ++iter_; - } - - template - void named_value(const std::string& name, T&& val) { - from_value(data_[name], std::forward(val)); - } - - template - void item(T&& val) { - from_value(*iter_, std::forward(val)); - ++iter_; - } - - template - void native(T&& val) { - data_.get_to(val); - } - - template - void value(T&& value) {} - - private: - const Value& data_; - Value::const_iterator iter_; -}; - -template -void from_value(const Value& value, T&& x) { - ValueInputArchive archive(value); - archive(std::forward(x)); -} - -// Required to avoid Value::Pointer being unwrapped by Value::get_to() -inline void from_value(const Value& value, Value& x) { x = value; } - -template -inline T from_value(const Value& value) { - T x{}; - from_value(value, x); - return x; -} - -namespace detail { - -inline void load(ValueInputArchive& archive, Value& v) { archive.native(v); } - -template , Value>::value, bool> = true> -inline void save(ValueOutputArchive& archive, T&& v) { - archive.native(std::forward(v)); -} - -} // namespace detail +namespace mmdeploy +{ + + 
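+    // Usage sketch (illustrative, not from the original sources):
+    // to_value/from_value below mirror the JSON archive but serialize into
+    // mmdeploy::Value; for a hypothetical archive-enabled type MyConfig:
+    //
+    //   MyConfig c;
+    //   mmdeploy::Value v = mmdeploy::to_value(c);
+    //   auto c2 = mmdeploy::from_value<MyConfig>(v);
+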
template + Value to_value(T&& val); + + // save to Value + class ValueOutputArchive : public OutputArchive + { + public: + explicit ValueOutputArchive(Value& data) + : data_(data) + { + } + + template + void init(array_tag) + { + data_ = ValueType::kArray; + } + + template + void init(object_tag) + { + data_ = ValueType::kObject; + } + + template + void named_value(const std::string& name, T&& val) + { + data_[name] = to_value(std::forward(val)); + } + + template + void item(T&& val) + { + data_.push_back(to_value(std::forward(val))); + } + + template, int> = 0> + void native(T&& val) + { + data_ = std::forward(val); + }; + + private: + Value& data_; + }; + + template + inline Value to_value(T&& val) + { + Value value; + ValueOutputArchive archive(value); + archive(std::forward(val)); + return value; + } + + // fast path + inline Value to_value(const Value& v) + { + return v; + } + inline Value to_value(Value&& v) + { + return std::move(v); + } + + template + void from_value(const Value& value, T&& x); + + template + T from_value(const Value& value); + + // load from Value + class ValueInputArchive : public InputArchive + { + public: + explicit ValueInputArchive(const Value& data) + : data_(data) + { + } + + template + void init(SizeType& size) + { + size = static_cast(data_.size()); + iter_ = data_.begin(); + } + + template + void named_value(std::string& name, T& val) + { + name = iter_.key(); + from_value(*iter_, std::forward(val)); + ++iter_; + } + + template + void named_value(const std::string& name, T&& val) + { + from_value(data_[name], std::forward(val)); + } + + template + void item(T&& val) + { + from_value(*iter_, std::forward(val)); + ++iter_; + } + + template + void native(T&& val) + { + data_.get_to(val); + } + + template + void value(T&& value) + { + } + + private: + const Value& data_; + Value::const_iterator iter_; + }; + + template + void from_value(const Value& value, T&& x) + { + ValueInputArchive archive(value); + archive(std::forward(x)); + } + + // Required to avoid Value::Pointer being unwrapped by Value::get_to() + inline void from_value(const Value& value, Value& x) + { + x = value; + } + + template + inline T from_value(const Value& value) + { + T x{}; + from_value(value, x); + return x; + } + + namespace detail + { + + inline void load(ValueInputArchive& archive, Value& v) + { + archive.native(v); + } + + template, Value>::value, bool> = true> + inline void save(ValueOutputArchive& archive, T&& v) + { + archive.native(std::forward(v)); + } + + } // namespace detail } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/CMakeLists.txt b/csrc/mmdeploy/backend_ops/CMakeLists.txt index 761c35a59a..4fc59bbf8c 100644 --- a/csrc/mmdeploy/backend_ops/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/CMakeLists.txt @@ -1,39 +1,39 @@ -if (NOT MSVC) - set(CMAKE_CXX_STANDARD 14) - set(CMAKE_CXX_FLAGS_RELEASE "-O3") -endif () +if(NOT MSVC) + set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_FLAGS_RELEASE "-O3") +endif() # build ONNXRUNTIME ops -if ("ort" IN_LIST MMDEPLOY_TARGET_BACKENDS) - if (NOT DEFINED ONNXRUNTIME_DIR) - set(ONNXRUNTIME_DIR $ENV{ONNXRUNTIME_DIR}) - endif () - if (NOT ONNXRUNTIME_DIR) - message(FATAL_ERROR " ONNXRUNTIME_DIR is not found.") - else () - message(STATUS "Build ONNXRUNTIME custom ops.") - add_subdirectory(onnxruntime) - endif () -endif () +if("ort" IN_LIST MMDEPLOY_TARGET_BACKENDS) + if(NOT DEFINED ONNXRUNTIME_DIR) + set(ONNXRUNTIME_DIR $ENV{ONNXRUNTIME_DIR}) + endif() + if(NOT ONNXRUNTIME_DIR) + message(FATAL_ERROR " ONNXRUNTIME_DIR is not 
found.") + else() + message(STATUS "Build ONNXRUNTIME custom ops.") + add_subdirectory(onnxruntime) + endif() +endif() # build TensorRT ops -if ("trt" IN_LIST MMDEPLOY_TARGET_BACKENDS) - if (NOT DEFINED TENSORRT_DIR) - set(TENSORRT_DIR $ENV{TENSORRT_DIR}) - endif () - message(STATUS "Build TensorRT custom ops.") - add_subdirectory(tensorrt) -endif () +if("trt" IN_LIST MMDEPLOY_TARGET_BACKENDS) + if(NOT DEFINED TENSORRT_DIR) + set(TENSORRT_DIR $ENV{TENSORRT_DIR}) + endif() + message(STATUS "Build TensorRT custom ops.") + add_subdirectory(tensorrt) +endif() # build ncnn ops -if ("ncnn" IN_LIST MMDEPLOY_TARGET_BACKENDS) - message(STATUS "Build ncnn custom ops") - add_subdirectory(ncnn) -endif () +if("ncnn" IN_LIST MMDEPLOY_TARGET_BACKENDS) + message(STATUS "Build ncnn custom ops") + add_subdirectory(ncnn) +endif() # build TorchScript ops -if ("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS - OR "coreml" IN_LIST MMDEPLOY_TARGET_BACKENDS) +if("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS OR "coreml" IN_LIST + MMDEPLOY_TARGET_BACKENDS) message(STATUS "Build torchscript custom ops") add_subdirectory(torchscript) -endif () +endif() diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh index 02c57c62e6..d5b0f57bfc 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/common_cuda_helper.cuh @@ -8,25 +8,27 @@ #include #define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 512 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) -inline int GET_BLOCKS(const int N) { - int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); - int max_block_num = 4096; - return std::min(optimal_block_num, max_block_num); +inline int GET_BLOCKS(const int N) +{ + int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); + int max_block_num = 4096; + return std::min(optimal_block_num, max_block_num); } -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(0); \ - } \ - } +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) \ + { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(0); \ + } \ + } /** * Returns a view of the original tensor with its dimensions permuted. 
@@ -38,57 +40,81 @@ inline int GET_BLOCKS(const int N) { * @param[in] src_dim dim of src tensor * @param[in] stream cuda stream handle */ -template -void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, - cudaStream_t stream = 0); - -template -cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, - const scalar_t* A, int lda, const scalar_t* B, int ldb, - const scalar_t* beta, scalar_t* C, int ldc); - -template -__device__ scalar_t bilinear_interpolate(const scalar_t* input, const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; - - if (y <= 0) y = 0; - if (x <= 0) x = 0; - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (scalar_t)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (scalar_t)x_low; - } else { - x_high = x_low + 1; - } - - scalar_t ly = y - y_low; - scalar_t lx = x - x_low; - scalar_t hy = 1. - ly, hx = 1. - lx; - // do bilinear interpolation - scalar_t v1 = input[y_low * width + x_low]; - scalar_t v2 = input[y_low * width + x_high]; - scalar_t v3 = input[y_high * width + x_low]; - scalar_t v4 = input[y_high * width + x_high]; - scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - - return val; +template +void memcpyPermute(scalar_t* dst, + const scalar_t* src, + int* src_size, + int* permute, + int src_dim, + cudaStream_t stream = 0); + +template +cublasStatus_t cublasGemmWrap(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const scalar_t* alpha, + const scalar_t* A, + int lda, + const scalar_t* B, + int ldb, + const scalar_t* beta, + scalar_t* C, + int ldc); + +template +__device__ scalar_t bilinear_interpolate(const scalar_t* input, + const int height, + const int width, + scalar_t y, + scalar_t x) +{ + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) + { + y_high = y_low = height - 1; + y = (scalar_t)y_low; + } + else + { + y_high = y_low + 1; + } + + if (x_low >= width - 1) + { + x_high = x_low = width - 1; + x = (scalar_t)x_low; + } + else + { + x_high = x_low + 1; + } + + scalar_t ly = y - y_low; + scalar_t lx = x - x_low; + scalar_t hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + scalar_t v1 = input[y_low * width + x_low]; + scalar_t v2 = input[y_low * width + x_high]; + scalar_t v3 = input[y_high * width + x_low]; + scalar_t v4 = input[y_high * width + x_high]; + scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; } #endif // COMMON_CUDA_HELPER diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h index a37e243109..a65096df08 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cpu.h @@ -1,82 +1,105 @@ #include #include -template -T bilinear_interpolate_2d(const T *src, const int64_t src_h, const int64_t src_w, const T h, - const T w) { - if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) { - return 0; - } +template +T bilinear_interpolate_2d(const T* src, + const int64_t src_h, + const int64_t src_w, + const T h, + const T w) +{ + if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) + { + return 0; + } - int64_t h_low = floor(h); - int64_t w_low = floor(w); - int64_t h_high = h_low + 1; - int64_t w_high = w_low + 1; + int64_t h_low = floor(h); + int64_t w_low = floor(w); + int64_t h_high = h_low + 1; + int64_t w_high = w_low + 1; - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh; - T hw = 1 - lw; + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; - T v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low]; - T v2 = 0; - if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high]; - T v3 = 0; - if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low]; - T v4 = 0; - if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high]; + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high]; + T v3 = 0; + if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low]; + T v4 = 0; + if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high]; - T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } // output: (channels * kernel_h * kernel_w, dst_h * dst_w) -template -void deformable_im2col_2d(const T *input, const T *offset, const T *mask, const int64_t src_h, - const int64_t src_w, const int64_t kernel_h, const int64_t kernel_w, - const int64_t pad_h, const int64_t pad_w, const int64_t stride_h, - const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, const int64_t channels, - const int64_t offset_groups, const int64_t dst_h, const int64_t dst_w, - const bool use_mask, T *columns) { - const int64_t workload = channels * dst_h * dst_w; - for (int64_t index = 0; index != workload; ++index) { - const int64_t ow = index % dst_w; - const int64_t oh = (index / dst_w) % dst_h; - const int64_t ic = index / (dst_w * dst_h); - const int64_t oc = ic * kernel_h * kernel_w; +template +void deformable_im2col_2d(const T* input, + const T* offset, + const T* mask, + const int64_t src_h, + const int64_t src_w, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t 
pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + const int64_t channels, + const int64_t offset_groups, + const int64_t dst_h, + const int64_t dst_w, + const bool use_mask, + T* columns) +{ + const int64_t workload = channels * dst_h * dst_w; + for (int64_t index = 0; index != workload; ++index) + { + const int64_t ow = index % dst_w; + const int64_t oh = (index / dst_w) % dst_h; + const int64_t ic = index / (dst_w * dst_h); + const int64_t oc = ic * kernel_h * kernel_w; - int64_t c_per_offset_grp = channels / offset_groups; - const int64_t grp_idx = ic / c_per_offset_grp; + int64_t c_per_offset_grp = channels / offset_groups; + const int64_t grp_idx = ic / c_per_offset_grp; - auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow); - auto input_ptr = input + ic * (src_h * src_w); - auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w; - auto mask_ptr = mask; - if (use_mask) { - mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w; - } + auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow); + auto input_ptr = input + ic * (src_h * src_w); + auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w; + auto mask_ptr = mask; + if (use_mask) + { + mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w; + } - for (int64_t kh = 0; kh < kernel_h; ++kh) { - for (int64_t kw = 0; kw < kernel_w; ++kw) { - const int64_t mask_idx = kh * kernel_w + kw; - const int64_t offset_idx = 2 * mask_idx; + for (int64_t kh = 0; kh < kernel_h; ++kh) + { + for (int64_t kw = 0; kw < kernel_w; ++kw) + { + const int64_t mask_idx = kh * kernel_w + kw; + const int64_t offset_idx = 2 * mask_idx; - T mask_value = 1; - if (use_mask) { - mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow]; - } + T mask_value = 1; + if (use_mask) + { + mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow]; + } - const T offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow]; - const T offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow]; - const T ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h; - const T iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w; - *columns_ptr = mask_value * bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw); - columns_ptr += dst_h * dst_w; - } + const T offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow]; + const T offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow]; + const T ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h; + const T iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w; + *columns_ptr = mask_value * bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw); + columns_ptr += dst_h * dst_w; + } + } } - } } diff --git a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh index 43166e7d6b..20429a37c9 100644 --- a/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh +++ b/csrc/mmdeploy/backend_ops/common/modulated_deform_conv/modulated_deform_conv_cuda.cuh @@ -71,110 +71,139 @@ #include "common_cuda_helper.cuh" -template -__device__ float mdcn_im2col_bilinear(const T *input, const int data_width, const int height, - const int width, float h, float w) { - int h_low = floorf(h); - int w_low = floorf(w); - int h_high = h_low + 1; - int 
w_high = w_low + 1; - - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh, hw = 1 - lw; - - T v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; - T v2 = 0; - if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; - T v3 = 0; - if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; - T v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; - - T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return float(val); +template +__device__ float mdcn_im2col_bilinear(const T* input, + const int data_width, + const int height, + const int width, + float h, + float w) +{ + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return float(val); } -template <> -__device__ float mdcn_im2col_bilinear<__half>(const __half *input, const int data_width, - const int height, const int width, float h, float w) { - int h_low = floorf(h); - int w_low = floorf(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h - h_low; - float lw = w - w_low; - float hh = 1 - lh, hw = 1 - lw; - - float v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = __half2float(input[h_low * data_width + w_low]); - float v2 = 0; - if (h_low >= 0 && w_high <= width - 1) v2 = __half2float(input[h_low * data_width + w_high]); - float v3 = 0; - if (h_high <= height - 1 && w_low >= 0) v3 = __half2float(input[h_high * data_width + w_low]); - float v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - v4 = __half2float(input[h_high * data_width + w_high]); - - float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; +template<> +__device__ float mdcn_im2col_bilinear<__half>(const __half* input, + const int data_width, + const int height, + const int width, + float h, + float w) +{ + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = __half2float(input[h_low * data_width + w_low]); + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) v2 = __half2float(input[h_low * data_width + w_high]); + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) v3 = __half2float(input[h_high * data_width + w_low]); + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = __half2float(input[h_high * data_width + w_high]); + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } -template -__global__ void modulated_deformable_im2col_gpu_kernel( - const int n, const T *data_im, const T *data_offset, const T *data_mask, const int height, - const int width, const int kernel_h, const int 
kernel_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, - const int channel_per_deformable_group, const int batch_size, const int num_channels, - const int deformable_group, const int height_col, const int width_col, T *data_col) { - CUDA_1D_KERNEL_LOOP(index, n) { - // index index of output matrix - const int w_col = index % width_col; - const int h_col = (index / width_col) % height_col; - const int b_col = (index / width_col / height_col) % batch_size; - const int c_im = (index / width_col / height_col) / batch_size; - const int c_col = c_im * kernel_h * kernel_w; - - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; - - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - - T *data_col_ptr = - data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; - const T *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; - const T *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * - 2 * kernel_h * kernel_w * height_col * width_col; - - const T *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * - kernel_h * kernel_w * height_col * width_col; - - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - const int data_offset_h_ptr = - ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; - const int data_offset_w_ptr = - ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const T offset_h = data_offset_ptr[data_offset_h_ptr]; - const T offset_w = data_offset_ptr[data_offset_w_ptr]; - const T mask = data_mask_ptr[data_mask_hw_ptr]; - float val = 0.0f; - const float h_im = h_in + i * dilation_h + (float)offset_h; - const float w_im = w_in + j * dilation_w + (float)offset_w; - if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) - val = mdcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); - *data_col_ptr = (T)(val * (float)mask); - data_col_ptr += batch_size * height_col * width_col; - } +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + 
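+        // data_offset is laid out as (batch, deformable_group,
+        // 2 * kernel_h * kernel_w, height_col, width_col): each kernel
+        // position stores an (h, w) offset pair, hence the factor of 2 in the
+        // per-group stride computed below.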
const T* data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + + const T* data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + float val = 0.0f; + const float h_im = h_in + i * dilation_h + (float)offset_h; + const float w_im = w_in + j * dilation_w + (float)offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = mdcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + *data_col_ptr = (T)(val * (float)mask); + data_col_ptr += batch_size * height_col * width_col; + } + } } - } } #endif // TRT_MODULATED_DEFORM_CONV_KERNEL_CUH diff --git a/csrc/mmdeploy/backend_ops/ncnn/CMakeLists.txt b/csrc/mmdeploy/backend_ops/ncnn/CMakeLists.txt index 4df9ad1233..f3e2aeb51e 100755 --- a/csrc/mmdeploy/backend_ops/ncnn/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/ncnn/CMakeLists.txt @@ -3,20 +3,21 @@ # ncnn find_package(ncnn) -if (ncnn_FOUND) - message(STATUS "ncnn library found!") -else () - message(FATAL_ERROR "Could not locate ncnn") -endif () +if(ncnn_FOUND) + message(STATUS "ncnn library found!") +else() + message(FATAL_ERROR "Could not locate ncnn") +endif() - -if (NOT ANDROID AND NOT IOS AND NOT CMAKE_CROSSCOMPILING) - add_subdirectory(ops) - add_subdirectory(onnx2ncnn) - add_subdirectory(pyncnn_ext) -else () - # In case of embedded platform, like android, or ios, we only build custom ncnn - # ops, and leave the executable converter(onnx2ncnn, pyncnn_ext) built under - # the host platforms - add_subdirectory(ops) -endif () +if(NOT ANDROID + AND NOT IOS + AND NOT CMAKE_CROSSCOMPILING) + add_subdirectory(ops) + add_subdirectory(onnx2ncnn) + add_subdirectory(pyncnn_ext) +else() + # In case of embedded platform, like android, or ios, we only build custom + # ncnn ops, and leave the executable converter(onnx2ncnn, pyncnn_ext) built + # under the host platforms + add_subdirectory(ops) +endif() diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/CMakeLists.txt b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/CMakeLists.txt index fe1687e951..deeb1e1241 100755 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/CMakeLists.txt @@ -4,22 +4,28 @@ project(onnx2ncnn) find_package(Protobuf) -if (PROTOBUF_FOUND) - if (${Protobuf_PROTOC_EXECUTABLE} STREQUAL "") - message(FATAL_ERROR "protoc not found, try `-DProtobuf_PROTOC_EXECUTABLE=/path/to/protoc`") - endif () - protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS - ${CMAKE_CURRENT_SOURCE_DIR}/onnx.proto) - add_executable(mmdeploy_onnx2ncnn onnx2ncnn.cpp fuse_pass.cpp shape_inference.cpp ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) - target_include_directories(mmdeploy_onnx2ncnn PRIVATE ${PROTOBUF_INCLUDE_DIR} - ${CMAKE_CURRENT_BINARY_DIR}) - target_link_libraries(mmdeploy_onnx2ncnn PRIVATE ${PROTOBUF_LIBRARIES}) - if (MSVC) - target_compile_options(mmdeploy_onnx2ncnn 
PUBLIC $<$:/Za>) - endif() - set(_NCNN_CONVERTER_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/backend/ncnn) - install(TARGETS mmdeploy_onnx2ncnn DESTINATION ${_NCNN_CONVERTER_DIR}) -else () +if(PROTOBUF_FOUND) + if(${Protobuf_PROTOC_EXECUTABLE} STREQUAL "") message( - FATAL_ERROR "Protobuf not found, onnx model convert tool won't be built") -endif () + FATAL_ERROR + "protoc not found, try `-DProtobuf_PROTOC_EXECUTABLE=/path/to/protoc`") + endif() + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS + ${CMAKE_CURRENT_SOURCE_DIR}/onnx.proto) + add_executable( + mmdeploy_onnx2ncnn onnx2ncnn.cpp fuse_pass.cpp shape_inference.cpp + ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) + target_include_directories( + mmdeploy_onnx2ncnn PRIVATE ${PROTOBUF_INCLUDE_DIR} + ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(mmdeploy_onnx2ncnn PRIVATE ${PROTOBUF_LIBRARIES}) + if(MSVC) + target_compile_options(mmdeploy_onnx2ncnn + PUBLIC $<$:/Za>) + endif() + set(_NCNN_CONVERTER_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/backend/ncnn) + install(TARGETS mmdeploy_onnx2ncnn DESTINATION ${_NCNN_CONVERTER_DIR}) +else() + message( + FATAL_ERROR "Protobuf not found, onnx model convert tool won't be built") +endif() diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp index 4d620e4c82..274ba76bca 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.cpp @@ -1,355 +1,402 @@ // Copyright (c) OpenMMLab. All rights reserved. #include "fuse_pass.h" -void fuse_identity(onnx::GraphProto* mutable_graph, +void fuse_identity(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - // fuse - // identity --> op - // to - // noop_reducencnn --> op - const int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; ++i) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - for (int j = 0; j < node->input_size(); ++j) { - std::string output_name = node->input(j); - onnx::NodeProto* last_node = find_node_by_output_name(mutable_graph, output_name); - if (last_node && last_node->op_type() == "Identity") { - node->set_input(j, last_node->input(0)); - node_reference[last_node->output(0)] -= 1; - node_reference[last_node->input(0)] += 1; - if (node_reference[last_node->output(0)] == 0) { - last_node->set_op_type("noop_reducedncnn"); - node_reference[last_node->input(0)] -= 1; - reduced_node_count += 1; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + // fuse + // identity --> op + // to + // noop_reducencnn --> op + const int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; ++i) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + for (int j = 0; j < node->input_size(); ++j) + { + std::string output_name = node->input(j); + onnx::NodeProto* last_node = find_node_by_output_name(mutable_graph, output_name); + if (last_node && last_node->op_type() == "Identity") + { + node->set_input(j, last_node->input(0)); + node_reference[last_node->output(0)] -= 1; + node_reference[last_node->input(0)] += 1; + if (node_reference[last_node->output(0)] == 0) + { + last_node->set_op_type("noop_reducedncnn"); + node_reference[last_node->input(0)] -= 1; + reduced_node_count += 1; + } + } } - } } - } } -void fuse_rewrite_gather(onnx::GraphProto* mutable_graph, +void fuse_rewrite_gather(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - 
std::set& blob_names, int& reduced_node_count) { - const int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; ++i) { - onnx::NodeProto* gather = mutable_graph->mutable_node(i); - if (gather->op_type() != "Gather") { - continue; - } - if (weights.find(std::string(gather->input(1))) == weights.end()) { - continue; - } - auto indices = get_node_attr_from_input_ai(weights[gather->input(1)]); - if (indices.size() != 1) { - continue; - } - + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + const int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; ++i) { - // reconstruct node connections - node_reference[gather->input(1)] -= 1; - std::string origin_inp = gather->input(0); - gather->clear_input(); - gather->add_input(origin_inp); - } + onnx::NodeProto* gather = mutable_graph->mutable_node(i); + if (gather->op_type() != "Gather") + { + continue; + } + if (weights.find(std::string(gather->input(1))) == weights.end()) + { + continue; + } + auto indices = get_node_attr_from_input_ai(weights[gather->input(1)]); + if (indices.size() != 1) + { + continue; + } - { - // update axis, starts and ends - int axis = get_node_attr_i(*gather, "axis", 1) - 1; + { + // reconstruct node connections + node_reference[gather->input(1)] -= 1; + std::string origin_inp = gather->input(0); + gather->clear_input(); + gather->add_input(origin_inp); + } + + { + // update axis, starts and ends + int axis = get_node_attr_i(*gather, "axis", 1) - 1; - gather->set_op_type("Crop"); - gather->clear_attribute(); + gather->set_op_type("Crop"); + gather->clear_attribute(); - int indice = indices[0]; - set_node_attr_ai(*gather, "starts", std::vector{indice}); - set_node_attr_ai(*gather, "ends", std::vector{indice + 1}); - set_node_attr_ai(*gather, "axis", std::vector{axis}); + int indice = indices[0]; + set_node_attr_ai(*gather, "starts", std::vector{indice}); + set_node_attr_ai(*gather, "ends", std::vector{indice + 1}); + set_node_attr_ai(*gather, "axis", std::vector{axis}); + } } - } } -void fuse_weight_reshape(onnx::GraphProto* mutable_graph, +void fuse_weight_reshape(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // weight <= Reshape(weight) - if (node->op_type() == "Reshape") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; - - weights[node->output(0)] = weights[node->input(0)]; - - // set weight shape directly - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else if (node->input_size() == 2) { - // opset 5 - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - weights[node->output(0)].clear_dims(); - for (int j = 0; j < shape.size(); j++) { - weights[node->output(0)].add_dims(shape[j]); - } - - // reduce - node->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - - reduced_node_count += 1; - i += 1; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // weight <= Reshape(weight) + if (node->op_type() == "Reshape") + { + // check weight + 
if (weights.find(node->input(0)) == weights.end()) continue; + + weights[node->output(0)] = weights[node->input(0)]; + + // set weight shape directly + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else if (node->input_size() == 2) + { + // opset 5 + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + weights[node->output(0)].clear_dims(); + for (int j = 0; j < shape.size(); j++) + { + weights[node->output(0)].add_dims(shape[j]); + } + + // reduce + node->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_weight_transpose(onnx::GraphProto* mutable_graph, +void fuse_weight_transpose(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // weight <= Transpose(weight) - if (node->op_type() == "Transpose") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; - - if (weights[node->input(0)].dims_size() != 2) continue; - - // perm = (1, 0) - std::vector perm = get_node_attr_ai(*node, "perm"); - if (perm.size() != 2) continue; - if (perm[0] != 1 || perm[1] != 0) continue; - - weights[node->output(0)] = weights[node->input(0)]; - - // permute weight - { - onnx::TensorProto& B = weights[node->output(0)]; - - const int h = B.dims(0); - const int w = B.dims(1); - - std::vector permuted_data; - permuted_data.reserve((size_t)h * w); - const float* bptr = - B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); - - for (int j = 0; j < w; j++) { - for (int k = 0; k < h; k++) { - float vb = bptr[k * w + j]; - permuted_data.push_back(vb); - } - } - - B.set_dims(0, w); - B.set_dims(1, h); - - if (B.has_raw_data()) { - B.set_raw_data(permuted_data.data(), permuted_data.size() * sizeof(float)); - } else { - for (int j = 0; j < (int)permuted_data.size(); j++) B.set_float_data(j, permuted_data[j]); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // weight <= Transpose(weight) + if (node->op_type() == "Transpose") + { + // check weight + if (weights.find(node->input(0)) == weights.end()) continue; + + if (weights[node->input(0)].dims_size() != 2) continue; + + // perm = (1, 0) + std::vector perm = get_node_attr_ai(*node, "perm"); + if (perm.size() != 2) continue; + if (perm[0] != 1 || perm[1] != 0) continue; + + weights[node->output(0)] = weights[node->input(0)]; + + // permute weight + { + onnx::TensorProto& B = weights[node->output(0)]; + + const int h = B.dims(0); + const int w = B.dims(1); + + std::vector permuted_data; + permuted_data.reserve((size_t)h * w); + const float* bptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + + for (int j = 0; j < w; j++) + { + for (int k = 0; k < h; k++) + { + float vb = bptr[k * w + j]; + permuted_data.push_back(vb); + } + } + + B.set_dims(0, w); + B.set_dims(1, h); + + if (B.has_raw_data()) + { + B.set_raw_data(permuted_data.data(), permuted_data.size() * sizeof(float)); + } + else + { + for (int j = 0; j < (int)permuted_data.size(); j++) B.set_float_data(j, permuted_data[j]); + } + } + + // reduce + node->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= 1; + + reduced_node_count += 1; + i += 1; } - } - - // reduce - node->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= 1; - - reduced_node_count += 1; - i += 1; } - } } -void fuse_shufflechannel(onnx::GraphProto* mutable_graph, +void fuse_shufflechannel(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // ShuffleChannel <= Reshape - Transpose - Reshape - // ShuffleChannel <= Reshape - Transpose - Constant - Reshape - if (node->op_type() == "Reshape") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; - - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - // 1 groups channels_per_group, height, width - // reverse style = channels_per_group, groups, height * width - if (shape.size() != 5 && shape.size() != 3) continue; - - if (shape.size() == 5 && shape[0] != 1) continue; - - if (i + 2 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - - if (node3->op_type() == "Constant") { - if (i + 3 >= node_count) continue; - - node3 = mutable_graph->mutable_node(i + 3); - } - - if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; - - if (node_reference[node2->output(0)] != 1) continue; - - // 0 2 1 3 4 - // reverse style = 1 0 2 - std::vector perm = get_node_attr_ai(*node2, "perm"); - if (perm.size() != 5 && perm.size() != 3) continue; - - if (perm.size() == 5 && - (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3 || perm[4] != 4)) - continue; - - if (perm.size() == 3 && (perm[0] != 1 || perm[1] != 0 || perm[2] != 2)) continue; - - std::vector shape3; - if (node3->input_size() == 1) { - shape3 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; - - shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } - - // 1, -1, height, width - // reverse style = group, -1, channels_per_group, height, width - if (shape3.size() != 4 && shape3.size() != 5) continue; - - if (shape3.size() == 4 && - (shape3[0] != 1 || (shape3[1] != -1 && shape3[1] != shape[1] * shape[2]))) - continue; - - if (shape3.size() == 5 && - (shape3[0] != shape[1] || shape3[2] != shape[0] || shape3[3] * shape3[4] != shape[2])) - continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if 
(node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - - node3->set_op_type("ShuffleChannel"); - node3->set_input(0, node->input(0)); - - onnx::AttributeProto* attr_group = node3->add_attribute(); - attr_group->set_name("group"); - attr_group->set_i(shape[1]); - - onnx::AttributeProto* attr_reverse = node3->add_attribute(); - attr_reverse->set_name("reverse"); - attr_reverse->set_i(shape.size() == 3); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // ShuffleChannel <= Reshape - Transpose - Reshape + // ShuffleChannel <= Reshape - Transpose - Constant - Reshape + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; + + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node->input(1)) == weights.end()) continue; + + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + // 1 groups channels_per_group, height, width + // reverse style = channels_per_group, groups, height * width + if (shape.size() != 5 && shape.size() != 3) continue; + + if (shape.size() == 5 && shape[0] != 1) continue; + + if (i + 2 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; + + node3 = mutable_graph->mutable_node(i + 3); + } + + if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue; + + if (node_reference[node2->output(0)] != 1) continue; + + // 0 2 1 3 4 + // reverse style = 1 0 2 + std::vector perm = get_node_attr_ai(*node2, "perm"); + if (perm.size() != 5 && perm.size() != 3) continue; + + if (perm.size() == 5 && + (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3 || perm[4] != 4)) + continue; + + if (perm.size() == 3 && (perm[0] != 1 || perm[1] != 0 || perm[2] != 2)) continue; + + std::vector shape3; + if (node3->input_size() == 1) + { + shape3 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; + + shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } + + // 1, -1, height, width + // reverse style = group, -1, channels_per_group, height, width + if (shape3.size() != 4 && shape3.size() != 5) continue; + + if (shape3.size() == 4 && + (shape3[0] != 1 || (shape3[1] != -1 && shape3[1] != shape[1] * shape[2]))) + continue; + + if (shape3.size() == 5 && + (shape3[0] != shape[1] || shape3[2] != shape[0] || shape3[3] * shape3[4] != shape[2])) + continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + + node3->set_op_type("ShuffleChannel"); + node3->set_input(0, node->input(0)); + + onnx::AttributeProto* attr_group = node3->add_attribute(); + attr_group->set_name("group"); + attr_group->set_i(shape[1]); + 
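+            // shape.size() == 3 corresponds to the reverse-style reshape
+            // (channels_per_group, groups, height * width) matched above, so
+            // the fused ShuffleChannel is tagged with reverse = 1.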
 
-void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
-                               std::map<std::string, onnx::TensorProto>& weights,
-                               std::map<std::string, int>& node_reference,
-                               std::set<std::string>& blob_names, int& reduced_node_count) {
-  int node_count = mutable_graph->node_size();
-  for (int i = 0; i < node_count; i++) {
-    onnx::NodeProto* node = mutable_graph->mutable_node(i);
-
-    // Split <= ShuffleChannel(reverse type) - Gather(0) - Gather(1)
-    if (node->op_type() == "ShuffleChannel") {
-      // reverse = 1
-      int reverse = get_node_attr_i(*node, "reverse");
-      if (reverse != 1) continue;
-
-      if (i + 2 >= node_count) continue;
-
-      onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
-      onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
-
-      if (node2->op_type() != "Gather" || node3->op_type() != "Gather") continue;
-
-      if (node2->input(0) != node->output(0) || node3->input(0) != node->output(0)) continue;
-
-      // axis = 0
-      int gather2_axis = get_node_attr_i(*node2, "axis");
-      if (gather2_axis != 0) continue;
-
-      // indices = 0
-      if (weights.find(node2->input(1)) == weights.end()) continue;
-
-      std::vector<int> gather2_indices = get_node_attr_from_input_ai(weights[node2->input(1)]);
-      if (gather2_indices.size() != 1 || gather2_indices[0] != 0) continue;
-
-      // axis = 0
-      int gather3_axis = get_node_attr_i(*node3, "axis");
-      if (gather3_axis != 0) continue;
-
-      // indices = 1
-      if (weights.find(node3->input(1)) == weights.end()) continue;
-
-      std::vector<int> gather3_indices = get_node_attr_from_input_ai(weights[node3->input(1)]);
-      if (gather3_indices.size() != 1 || gather3_indices[0] != 1) continue;
-
-      // reduce
-      node2->set_op_type("noop_reducedncnn");
-
-      node_reference[node->output(0)] -= 2;
-      node_reference[node2->input(1)] -= 1;
-      node_reference[node3->input(1)] -= 1;
-
-      node3->set_op_type("Split");
-      node3->clear_input();
-      node3->add_input(node->output(0));
-      node3->add_output(node3->output(0));
-      node3->set_output(0, node2->output(0));
-
-      node3->clear_attribute();
-      onnx::AttributeProto* attr_axis = node3->add_attribute();
-      attr_axis->set_name("axis");
-      attr_axis->set_i(1);
-
-      reduced_node_count += 1;
-      i += 1;
-    }
-  }
+void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
+                               std::map<std::string, onnx::TensorProto>& weights,
+                               std::map<std::string, int>& node_reference,
+                               std::set<std::string>& blob_names,
+                               int& reduced_node_count)
+{
+    int node_count = mutable_graph->node_size();
+    for (int i = 0; i < node_count; i++)
+    {
+        onnx::NodeProto* node = mutable_graph->mutable_node(i);
+
+        // Split <= ShuffleChannel(reverse type) - Gather(0) - Gather(1)
+        if (node->op_type() == "ShuffleChannel")
+        {
+            // reverse = 1
+            int reverse = get_node_attr_i(*node, "reverse");
+            if (reverse != 1) continue;
+
+            if (i + 2 >= node_count) continue;
+
+            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
+            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
+
+            if (node2->op_type() != "Gather" || node3->op_type() != "Gather") continue;
+
+            if (node2->input(0) != node->output(0) || node3->input(0) != node->output(0)) continue;
+
+            // axis = 0
+            int gather2_axis = get_node_attr_i(*node2, "axis");
+            if (gather2_axis != 0) continue;
+
+            // indices = 0
+            if (weights.find(node2->input(1)) == weights.end()) continue;
+
+            std::vector<int> gather2_indices = get_node_attr_from_input_ai(weights[node2->input(1)]);
+            if (gather2_indices.size() != 1 || gather2_indices[0] != 0) continue;
+
+            // axis = 0
+            int gather3_axis = get_node_attr_i(*node3, "axis");
+            if (gather3_axis != 0) continue;
+
+            // indices = 1
+            if (weights.find(node3->input(1)) == weights.end()) continue;
+
+            std::vector<int> gather3_indices = get_node_attr_from_input_ai(weights[node3->input(1)]);
+            if (gather3_indices.size() != 1 || gather3_indices[0] != 1) continue;
+
+            // reduce
+            node2->set_op_type("noop_reducedncnn");
+
+            node_reference[node->output(0)] -= 2;
+            node_reference[node2->input(1)] -= 1;
+            node_reference[node3->input(1)] -= 1;
+
+            node3->set_op_type("Split");
+            node3->clear_input();
+            node3->add_input(node->output(0));
+            node3->add_output(node3->output(0));
+            node3->set_output(0, node2->output(0));
+
+            node3->clear_attribute();
+            onnx::AttributeProto* attr_axis = node3->add_attribute();
+            attr_axis->set_name("axis");
+            attr_axis->set_i(1);
+
+            reduced_node_count += 1;
+            i += 1;
+        }
+    }
 }
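Both fusions above gate on the same bookkeeping contract: an intermediate blob may only be folded away when node_reference records exactly one consumer (two, for the shared ShuffleChannel output), and every edge a rewrite removes must have its count decremented and its blob erased. A tiny self-contained sketch of that invariant, with a hypothetical blob name, not converter code:

    #include <cassert>
    #include <map>
    #include <set>
    #include <string>

    int main()
    {
        std::map<std::string, int> node_reference = {{"shuffle_out", 1}};
        std::set<std::string> blob_names = {"shuffle_out"};

        // the pattern only fires when the intermediate blob has a single consumer
        assert(node_reference["shuffle_out"] == 1);

        node_reference["shuffle_out"] -= 1; // the rewrite consumed this edge
        blob_names.erase("shuffle_out");    // the blob is no longer materialized

        assert(node_reference["shuffle_out"] == 0);
        return 0;
    }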
 
 /**
@@ -369,2034 +416,2209 @@ void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph,
  * @param blob_names
  * @param reduced_node_count
  */
-void fuse_conv_reshape(onnx::GraphProto* mutable_graph,
-                       std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count) {
-  std::map<std::string, std::vector<int>> shape_context;
-  const int node_count = mutable_graph->node_size();
-
-  for (int i = 0; i < node_count; i++) {
-    onnx::NodeProto* conv = mutable_graph->mutable_node(i);
-
-    if (conv->op_type() != "Conv") {
-      continue;
-    }
-
-    if (i + 4 >= node_count) {
-      continue;
-    }
-
-    onnx::NodeProto *shape = nullptr, *slice = nullptr, *concat = nullptr, *reshape = nullptr;
-
-    // match [Shape ... Slice, Concat ... Reshape] from near sequence, skip useless Constant
-    std::vector<std::tuple<std::string, onnx::NodeProto**>> candidates = {
-        {"Shape", &shape}, {"Slice", &slice}, {"Concat", &concat}, {"Reshape", &reshape}};
-
-    int MAX = std::min(10, node_count - i - 1);
-    int pos_candidate = 0;
-
-    for (int j = 0; j < MAX; ++j) {
-      auto node_ptr = mutable_graph->mutable_node(j + i + 1);
-      if (node_ptr->op_type() == "Constant") {
-        continue;
-      }
-      if (node_ptr->op_type() == std::get<0>(candidates[pos_candidate])) {
-        *(std::get<1>(candidates[pos_candidate])) = node_ptr;
-        pos_candidate++;
-      }
-    }
-
-    if (pos_candidate != candidates.size()) {
-      // not match the sequence
-      continue;
-    }
+void fuse_conv_reshape(onnx::GraphProto* mutable_graph,
+                       std::map<std::string, onnx::TensorProto>& weights,
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count)
+{
+    std::map<std::string, std::vector<int>> shape_context;
+    const int node_count = mutable_graph->node_size();
+
+    for (int i = 0; i < node_count; i++)
+    {
+        onnx::NodeProto* conv = mutable_graph->mutable_node(i);
+
+        if (conv->op_type() != "Conv")
+        {
+            continue;
+        }
+
+        if (i + 4 >= node_count)
+        {
+            continue;
+        }
+
+        onnx::NodeProto * shape = nullptr, *slice = nullptr, *concat = nullptr, *reshape = nullptr;
+
+        // match [Shape ...
Reshape] from near sequence, skip useless Constant + std::vector> candidates = { + {"Shape", &shape}, + {"Slice", &slice}, + {"Concat", &concat}, + {"Reshape", &reshape}}; + + int MAX = std::min(10, node_count - i - 1); + int pos_candidate = 0; + + for (int j = 0; j < MAX; ++j) + { + auto node_ptr = mutable_graph->mutable_node(j + i + 1); + if (node_ptr->op_type() == "Constant") + { + continue; + } + if (node_ptr->op_type() == std::get<0>(candidates[pos_candidate])) + { + *(std::get<1>(candidates[pos_candidate])) = node_ptr; + pos_candidate++; + } + } - if (node_reference[conv->output(0)] != 2 || node_reference[shape->output(0)] != 1 || - node_reference[slice->output(0)] != 1 || node_reference[concat->output(0)] != 1 || - node_reference[reshape->output(0)] != 1) { - continue; - } + if (pos_candidate != candidates.size()) + { + // not match the sequence + continue; + } - // check the connections - if (shape->input(0) != conv->output(0) || reshape->input(0) != conv->output(0)) { - continue; - } - if (slice->input(0) != shape->output(0)) { - continue; - } - if (concat->input(0) != slice->output(0)) { - continue; - } - if (reshape->input(0) != conv->output(0) || reshape->input(1) != concat->output(0)) { - continue; - } + if (node_reference[conv->output(0)] != 2 || node_reference[shape->output(0)] != 1 || + node_reference[slice->output(0)] != 1 || node_reference[concat->output(0)] != 1 || + node_reference[reshape->output(0)] != 1) + { + continue; + } - // add reshape attr - auto result = query_shape(mutable_graph, concat, weights, shape_context); - if (!std::get<0>(result)) { - continue; - } - set_node_attr_ai(*reshape, "shape", std::get<1>(result)); + // check the connections + if (shape->input(0) != conv->output(0) || reshape->input(0) != conv->output(0)) + { + continue; + } + if (slice->input(0) != shape->output(0)) + { + continue; + } + if (concat->input(0) != slice->output(0)) + { + continue; + } + if (reshape->input(0) != conv->output(0) || reshape->input(1) != concat->output(0)) + { + continue; + } - // reconstruct graph - { - // remove reference - node_reference[reshape->input(1)] -= 1; - node_reference[concat->input(0)] -= 1; - node_reference[slice->input(0)] -= 1; - node_reference[shape->input(0)] -= 1; - - // remove tensor/blob on edge - blob_names.erase(slice->input(0)); - blob_names.erase(slice->input(1)); - blob_names.erase(slice->input(2)); - blob_names.erase(slice->input(3)); - weights.erase(slice->input(1)); - weights.erase(slice->input(2)); - weights.erase(slice->input(3)); - - blob_names.erase(concat->input(0)); - blob_names.erase(concat->input(1)); - weights.erase(concat->input(1)); - - blob_names.erase(reshape->input(0)); - - // update edge - shape->clear_input(); - reshape->clear_input(); - reshape->add_input(conv->output(0)); - - shape->set_op_type("noop_reducedncnn"); - slice->set_op_type("noop_reducedncnn"); - concat->set_op_type("noop_reducedncnn"); - - reduced_node_count += 3; + // add reshape attr + auto result = query_shape(mutable_graph, concat, weights, shape_context); + if (!std::get<0>(result)) + { + continue; + } + set_node_attr_ai(*reshape, "shape", std::get<1>(result)); + + // reconstruct graph + { + // remove reference + node_reference[reshape->input(1)] -= 1; + node_reference[concat->input(0)] -= 1; + node_reference[slice->input(0)] -= 1; + node_reference[shape->input(0)] -= 1; + + // remove tensor/blob on edge + blob_names.erase(slice->input(0)); + blob_names.erase(slice->input(1)); + blob_names.erase(slice->input(2)); + blob_names.erase(slice->input(3)); 
+ weights.erase(slice->input(1)); + weights.erase(slice->input(2)); + weights.erase(slice->input(3)); + + blob_names.erase(concat->input(0)); + blob_names.erase(concat->input(1)); + weights.erase(concat->input(1)); + + blob_names.erase(reshape->input(0)); + + // update edge + shape->clear_input(); + reshape->clear_input(); + reshape->add_input(conv->output(0)); + + shape->set_op_type("noop_reducedncnn"); + slice->set_op_type("noop_reducedncnn"); + concat->set_op_type("noop_reducedncnn"); + + reduced_node_count += 3; + } + i += 3; } - i += 3; - } } -void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, +void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Add/Sub/Mul/Div/Min/Max/Pow - if (node->op_type() == "Add" || node->op_type() == "Sub" || node->op_type() == "Mul" || - node->op_type() == "Div" || node->op_type() == "Max" || node->op_type() == "Min" || - node->op_type() == "Pow") { - if (weights.find(node->input(1)) == weights.end()) continue; + // Add/Sub/Mul/Div/Min/Max/Pow + if (node->op_type() == "Add" || node->op_type() == "Sub" || node->op_type() == "Mul" || + node->op_type() == "Div" || node->op_type() == "Max" || node->op_type() == "Min" || + node->op_type() == "Pow") + { + if (weights.find(node->input(1)) == weights.end()) continue; - const onnx::TensorProto& scalar_b = weights[node->input(1)]; - if (scalar_b.dims_size() != 0 || get_tensor_proto_data_size(scalar_b) != 1) continue; + const onnx::TensorProto& scalar_b = weights[node->input(1)]; + if (scalar_b.dims_size() != 0 || get_tensor_proto_data_size(scalar_b) != 1) continue; - float b = get_node_attr_from_input(scalar_b); + float b = get_node_attr_from_input(scalar_b); - node_reference[node->input(1)] -= 1; + node_reference[node->input(1)] -= 1; - std::string input = node->input(0); + std::string input = node->input(0); - node->clear_input(); - node->add_input(input); + node->clear_input(); + node->add_input(input); - onnx::AttributeProto* attr_with_scalar = node->add_attribute(); - attr_with_scalar->set_name("with_scalar"); - attr_with_scalar->set_i(1); + onnx::AttributeProto* attr_with_scalar = node->add_attribute(); + attr_with_scalar->set_name("with_scalar"); + attr_with_scalar->set_i(1); - onnx::AttributeProto* attr_b = node->add_attribute(); - attr_b->set_name("b"); - attr_b->set_f(b); + onnx::AttributeProto* attr_b = node->add_attribute(); + attr_b->set_name("b"); + attr_b->set_f(b); + } } - } } -void fuse_hardswish(onnx::GraphProto* mutable_graph, +void fuse_hardswish(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Mul(*(1/6)) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6) - // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Mul(*(1/6)) - // out = x * F.relu6(x + 
3, inplace=True) / 6
-    if (node->op_type() == "Add") {
-      if (node_reference[node->output(0)] != 1) continue;
-
-      if (i + 3 >= node_count) continue;
-
-      if (weights.find(node->input(1)) == weights.end()) continue;
-
-      const onnx::TensorProto& add_three = weights[node->input(1)];
-      if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue;
-
-      float constant_add_three = get_node_attr_from_input<float>(add_three);
-      if (constant_add_three != 3.f) continue;
-
-      onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
-      onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
-      onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3);
-
-      if (node4->op_type() == "Constant") {
-        if (i + 4 >= node_count) continue;
-
-        node4 = mutable_graph->mutable_node(i + 4);
-      }
-
-      if (node2->op_type() != "Clip" || node3->op_type() != "Mul" ||
-          (node4->op_type() != "Div" && node4->op_type() != "Mul"))
-        continue;
-
-      if (node_reference[node2->output(0)] != 1) continue;
-
-      float relu6_min;
-      float relu6_max;
-      if (node2->input_size() == 1) {
-        relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX);
-        relu6_max = get_node_attr_f(*node2, "max", FLT_MAX);
-      } else {
-        const onnx::TensorProto& min_tp = weights[node2->input(1)];
-        const onnx::TensorProto& max_tp = weights[node2->input(2)];
-
-        relu6_min = get_node_attr_from_input<float>(min_tp);
-        relu6_max = get_node_attr_from_input<float>(max_tp);
-      }
-      if (relu6_min != 0.f || relu6_max != 6.f) continue;
-
-      if (node_reference[node3->output(0)] != 1) continue;
-
-      if (node3->input(0) != node->input(0) || node3->input(1) != node2->output(0)) continue;
-
-      if (weights.find(node4->input(1)) == weights.end()) continue;
-
-      const onnx::TensorProto& div_six = weights[node4->input(1)];
-      if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue;
-
-      float constant_div_six = get_node_attr_from_input<float>(div_six);
-      if (node4->op_type() == "Div" && constant_div_six != 6.f) continue;
-      if (node4->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue;
-
-      // reduce
-      node->set_op_type("noop_reducedncnn");
-      node2->set_op_type("noop_reducedncnn");
-      node3->set_op_type("noop_reducedncnn");
-
-      node_reference[node->input(0)] -= 1;
-      node_reference[node->input(1)] -= 1;
-      node_reference[node->output(0)] -= 1;
-      if (node2->input_size() == 3) {
-        node_reference[node2->input(1)] -= 1;
-        node_reference[node2->input(2)] -= 1;
-      }
-      node_reference[node2->output(0)] -= 1;
-      node_reference[node3->output(0)] -= 1;
-      node_reference[node4->input(1)] -= 1;
-
-      blob_names.erase(node->output(0));
-      blob_names.erase(node2->output(0));
-      blob_names.erase(node3->output(0));
-
-      node4->set_op_type("HardSwish");
-      node4->clear_input();
-      node4->add_input(node->input(0));
-
-      onnx::AttributeProto* attr_alpha = node4->add_attribute();
-      attr_alpha->set_name("alpha");
-      attr_alpha->set_f(1.f / 6.f);
-
-      onnx::AttributeProto* attr_beta = node4->add_attribute();
-      attr_beta->set_name("beta");
-      attr_beta->set_f(3.f / 6.f);
-
-      reduced_node_count += 3;
-      i += 3;
-    }
-  }
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count)
+{
+    int node_count = mutable_graph->node_size();
+    for (int i = 0; i < node_count; i++)
+    {
+        onnx::NodeProto* node = mutable_graph->mutable_node(i);
+
+        // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Div(/6)
+        // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Mul(*(1/6))
+        // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Div(/6)
+        // HardSwish <= Add(+3) - Clip(0,6) - Mul(X,) - Constant - Mul(*(1/6))
+        // out = x * F.relu6(x + 3, inplace=True) / 6
+        if (node->op_type() == "Add")
+        {
+            if (node_reference[node->output(0)] != 1) continue;
+
+            if (i + 3 >= node_count) continue;
+
+            if (weights.find(node->input(1)) == weights.end()) continue;
+
+            const onnx::TensorProto& add_three = weights[node->input(1)];
+            if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue;
+
+            float constant_add_three = get_node_attr_from_input<float>(add_three);
+            if (constant_add_three != 3.f) continue;
+
+            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
+            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
+            onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3);
+
+            if (node4->op_type() == "Constant")
+            {
+                if (i + 4 >= node_count) continue;
+
+                node4 = mutable_graph->mutable_node(i + 4);
+            }
+
+            if (node2->op_type() != "Clip" || node3->op_type() != "Mul" ||
+                (node4->op_type() != "Div" && node4->op_type() != "Mul"))
+                continue;
+
+            if (node_reference[node2->output(0)] != 1) continue;
+
+            float relu6_min;
+            float relu6_max;
+            if (node2->input_size() == 1)
+            {
+                relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX);
+                relu6_max = get_node_attr_f(*node2, "max", FLT_MAX);
+            }
+            else
+            {
+                const onnx::TensorProto& min_tp = weights[node2->input(1)];
+                const onnx::TensorProto& max_tp = weights[node2->input(2)];
+
+                relu6_min = get_node_attr_from_input<float>(min_tp);
+                relu6_max = get_node_attr_from_input<float>(max_tp);
+            }
+            if (relu6_min != 0.f || relu6_max != 6.f) continue;
+
+            if (node_reference[node3->output(0)] != 1) continue;
+
+            if (node3->input(0) != node->input(0) || node3->input(1) != node2->output(0)) continue;
+
+            if (weights.find(node4->input(1)) == weights.end()) continue;
+
+            const onnx::TensorProto& div_six = weights[node4->input(1)];
+            if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue;
+
+            float constant_div_six = get_node_attr_from_input<float>(div_six);
+            if (node4->op_type() == "Div" && constant_div_six != 6.f) continue;
+            if (node4->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue;
+
+            // reduce
+            node->set_op_type("noop_reducedncnn");
+            node2->set_op_type("noop_reducedncnn");
+            node3->set_op_type("noop_reducedncnn");
+
+            node_reference[node->input(0)] -= 1;
+            node_reference[node->input(1)] -= 1;
+            node_reference[node->output(0)] -= 1;
+            if (node2->input_size() == 3)
+            {
+                node_reference[node2->input(1)] -= 1;
+                node_reference[node2->input(2)] -= 1;
+            }
+            node_reference[node2->output(0)] -= 1;
+            node_reference[node3->output(0)] -= 1;
+            node_reference[node4->input(1)] -= 1;
+
+            blob_names.erase(node->output(0));
+            blob_names.erase(node2->output(0));
+            blob_names.erase(node3->output(0));
+
+            node4->set_op_type("HardSwish");
+            node4->clear_input();
+            node4->add_input(node->input(0));
+
+            onnx::AttributeProto* attr_alpha = node4->add_attribute();
+            attr_alpha->set_name("alpha");
+            attr_alpha->set_f(1.f / 6.f);
+
+            onnx::AttributeProto* attr_beta = node4->add_attribute();
+            attr_beta->set_name("beta");
+            attr_beta->set_f(3.f / 6.f);
+
+            reduced_node_count += 3;
+            i += 3;
+        }
+    }
 
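The alpha and beta the fused node receives above (1/6 and 3/6) come from rewriting x * relu6(x + 3) / 6 as x * clamp(alpha * x + beta, 0, 1). A quick numeric check of that identity, illustrative only and not converter code:

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    static float relu6(float v) { return std::min(std::max(v, 0.f), 6.f); }

    // HardSwish as parameterized by the fusion: y = x * clamp(alpha * x + beta, 0, 1)
    static float hardswish(float x, float alpha, float beta)
    {
        return x * std::min(std::max(alpha * x + beta, 0.f), 1.f);
    }

    int main()
    {
        for (float x = -8.f; x <= 8.f; x += 0.25f)
        {
            float subgraph = x * relu6(x + 3.f) / 6.f;
            float fused = hardswish(x, 1.f / 6.f, 3.f / 6.f);
            assert(std::fabs(subgraph - fused) < 1e-6f);
        }
        return 0;
    }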
-  for (int i = 0; i < node_count; i++) {
-    onnx::NodeProto* node = mutable_graph->mutable_node(i);
-
-    // HardSwish <= HardSigmoid - Mul
-    // out = x * hsigmoid(x)
-    if (node->op_type() == "HardSigmoid") {
-      if (node_reference[node->output(0)] != 1) continue;
-
-      float alpha = get_node_attr_f(*node, "alpha", 0.2f);
-      float beta = get_node_attr_f(*node, "beta", 0.5f);
-
-      if (i + 1 >= node_count) continue;
-
-      onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
-
-      if (node2->op_type() != "Mul") continue;
-
-      if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue;
-
-      // reduce
-      node->set_op_type("noop_reducedncnn");
-
-      node_reference[node->input(0)] -= 1;
-      node_reference[node->output(0)] -= 1;
-
-      blob_names.erase(node->output(0));
-
-      node2->set_op_type("HardSwish");
-      node2->clear_input();
-      node2->add_input(node->input(0));
-
-      onnx::AttributeProto* attr_alpha = node2->add_attribute();
-      attr_alpha->set_name("alpha");
-      attr_alpha->set_f(alpha);
-
-      onnx::AttributeProto* attr_beta = node2->add_attribute();
-      attr_beta->set_name("beta");
-      attr_beta->set_f(beta);
-
-      reduced_node_count += 1;
-      i += 1;
-    }
-  }
+    for (int i = 0; i < node_count; i++)
+    {
+        onnx::NodeProto* node = mutable_graph->mutable_node(i);
+
+        // HardSwish <= HardSigmoid - Mul
+        // out = x * hsigmoid(x)
+        if (node->op_type() == "HardSigmoid")
+        {
+            if (node_reference[node->output(0)] != 1) continue;
+
+            float alpha = get_node_attr_f(*node, "alpha", 0.2f);
+            float beta = get_node_attr_f(*node, "beta", 0.5f);
+
+            if (i + 1 >= node_count) continue;
+
+            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
+
+            if (node2->op_type() != "Mul") continue;
+
+            if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue;
+
+            // reduce
+            node->set_op_type("noop_reducedncnn");
+
+            node_reference[node->input(0)] -= 1;
+            node_reference[node->output(0)] -= 1;
+
+            blob_names.erase(node->output(0));
+
+            node2->set_op_type("HardSwish");
+            node2->clear_input();
+            node2->add_input(node->input(0));
+
+            onnx::AttributeProto* attr_alpha = node2->add_attribute();
+            attr_alpha->set_name("alpha");
+            attr_alpha->set_f(alpha);
+
+            onnx::AttributeProto* attr_beta = node2->add_attribute();
+            attr_beta->set_name("beta");
+            attr_beta->set_f(beta);
+
+            reduced_node_count += 1;
+            i += 1;
+        }
+    }
 }
 
-void fuse_hardsigmoid(onnx::GraphProto* mutable_graph,
-                      std::map<std::string, onnx::TensorProto>& weights,
-                      std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                      int& reduced_node_count) {
-  int node_count = mutable_graph->node_size();
-  for (int i = 0; i < node_count; i++) {
-    onnx::NodeProto* node = mutable_graph->mutable_node(i);
-
-    // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6)
-    // HardSigmoid <= Add(+3) - Clip(0,6) - Mul(*(1/6))
-    // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6)
-    // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Mul(*(1/6))
-    // out = F.relu6(x + 3, inplace=True) / 6
-    if (node->op_type() == "Add") {
-      if (node_reference[node->output(0)] != 1) continue;
-
-      if (i + 2 >= node_count) continue;
-
-      if (weights.find(node->input(1)) == weights.end()) continue;
-
-      const onnx::TensorProto& add_three = weights[node->input(1)];
-      if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue;
-
-      float constant_add_three = get_node_attr_from_input<float>(add_three);
-      if (constant_add_three != 3.f) continue;
-
-      onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
-      onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
-
-      if (node3->op_type() == "Constant") {
-        if (i + 3 >= node_count) continue;
-
-        node3 = mutable_graph->mutable_node(i + 3);
-      }
-
-      if (node2->op_type() != "Clip" || (node3->op_type() != "Div" && node3->op_type() != "Mul"))
-        continue;
-
-      if (node_reference[node2->output(0)] != 1) continue;
-
-      float relu6_min;
-      float relu6_max;
-      if (node2->input_size() == 1) {
-        relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX);
-        relu6_max = get_node_attr_f(*node2, "max", FLT_MAX);
-      } else {
-        const
onnx::TensorProto& min_tp = weights[node2->input(1)]; - const onnx::TensorProto& max_tp = weights[node2->input(2)]; - - relu6_min = get_node_attr_from_input(min_tp); - relu6_max = get_node_attr_from_input(max_tp); - } - if (relu6_min != 0.f || relu6_max != 6.f) continue; - - if (weights.find(node3->input(1)) == weights.end()) continue; - - const onnx::TensorProto& div_six = weights[node3->input(1)]; - if (div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; - - float constant_div_six = get_node_attr_from_input(div_six); - if (node3->op_type() == "Div" && constant_div_six != 6.f) continue; - if (node3->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - - node_reference[node->input(1)] -= 1; - node_reference[node->output(0)] -= 1; - if (node2->input_size() == 3) { - node_reference[node2->input(1)] -= 1; - node_reference[node2->input(2)] -= 1; - } - node_reference[node2->output(0)] -= 1; - node_reference[node3->input(1)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - - node3->set_op_type("HardSigmoid"); - node3->clear_input(); - node3->add_input(node->input(0)); - - onnx::AttributeProto* attr_alpha = node3->add_attribute(); - attr_alpha->set_name("alpha"); - attr_alpha->set_f(1.f / 6.f); - - onnx::AttributeProto* attr_beta = node3->add_attribute(); - attr_beta->set_name("beta"); - attr_beta->set_f(3.f / 6.f); - - reduced_node_count += 2; - i += 2; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // HardSigmoid <= Add(+3) - Clip(0,6) - Div(/6) + // HardSigmoid <= Add(+3) - Clip(0,6) - Mul(*(1/6)) + // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Div(/6) + // HardSigmoid <= Add(+3) - Clip(0,6) - Constant - Mul(*(1/6)) + // out = F.relu6(x + 3, inplace=True) / 6 + if (node->op_type() == "Add") + { + if (node_reference[node->output(0)] != 1) continue; + + if (i + 2 >= node_count) continue; + + if (weights.find(node->input(1)) == weights.end()) continue; + + const onnx::TensorProto& add_three = weights[node->input(1)]; + if (add_three.dims_size() != 0 || get_tensor_proto_data_size(add_three) != 1) continue; + + float constant_add_three = get_node_attr_from_input(add_three); + if (constant_add_three != 3.f) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + + if (node3->op_type() == "Constant") + { + if (i + 3 >= node_count) continue; + + node3 = mutable_graph->mutable_node(i + 3); + } + + if (node2->op_type() != "Clip" || (node3->op_type() != "Div" && node3->op_type() != "Mul")) + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + float relu6_min; + float relu6_max; + if (node2->input_size() == 1) + { + relu6_min = get_node_attr_f(*node2, "min", -FLT_MAX); + relu6_max = get_node_attr_f(*node2, "max", FLT_MAX); + } + else + { + const onnx::TensorProto& min_tp = weights[node2->input(1)]; + const onnx::TensorProto& max_tp = weights[node2->input(2)]; + + relu6_min = get_node_attr_from_input(min_tp); + relu6_max = get_node_attr_from_input(max_tp); + } + if (relu6_min != 0.f || relu6_max != 6.f) continue; + + if (weights.find(node3->input(1)) == weights.end()) continue; + + const onnx::TensorProto& div_six = weights[node3->input(1)]; + if 
(div_six.dims_size() != 0 || get_tensor_proto_data_size(div_six) != 1) continue; + + float constant_div_six = get_node_attr_from_input(div_six); + if (node3->op_type() == "Div" && constant_div_six != 6.f) continue; + if (node3->op_type() == "Mul" && constant_div_six != 1 / 6.f) continue; + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + + node_reference[node->input(1)] -= 1; + node_reference[node->output(0)] -= 1; + if (node2->input_size() == 3) + { + node_reference[node2->input(1)] -= 1; + node_reference[node2->input(2)] -= 1; + } + node_reference[node2->output(0)] -= 1; + node_reference[node3->input(1)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + + node3->set_op_type("HardSigmoid"); + node3->clear_input(); + node3->add_input(node->input(0)); + + onnx::AttributeProto* attr_alpha = node3->add_attribute(); + attr_alpha->set_name("alpha"); + attr_alpha->set_f(1.f / 6.f); + + onnx::AttributeProto* attr_beta = node3->add_attribute(); + attr_beta->set_name("beta"); + attr_beta->set_f(3.f / 6.f); + + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_swish(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); +void fuse_swish(onnx::GraphProto* mutable_graph, std::map& weights, std::map& node_reference, std::set& blob_names, int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // Swish <= Sigmoid - Mul - // x * torch.sigmoid(x) - if (node->op_type() == "Sigmoid") { - if (node_reference[node->output(0)] != 1) continue; + // Swish <= Sigmoid - Mul + // x * torch.sigmoid(x) + if (node->op_type() == "Sigmoid") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 1 >= node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "Mul") continue; + if (node2->op_type() != "Mul") continue; - if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; + if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->input(0)] -= 1; - node_reference[node->output(0)] -= 1; + node_reference[node->input(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_op_type("Swish"); - node2->clear_input(); - node2->add_input(node->input(0)); + node2->set_op_type("Swish"); + node2->clear_input(); + node2->add_input(node->input(0)); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph, +void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& 
reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // BatchNormalization <= Unsqueeze - BatchNormalization - Squeeze - if (node->op_type() == "Unsqueeze") { - if (node_reference[node->output(0)] != 1) continue; + // BatchNormalization <= Unsqueeze - BatchNormalization - Squeeze + if (node->op_type() == "Unsqueeze") + { + if (node_reference[node->output(0)] != 1) continue; - if (i + 2 >= node_count) continue; + if (i + 2 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - if (node2->op_type() != "BatchNormalization" || node3->op_type() != "Squeeze") continue; + if (node2->op_type() != "BatchNormalization" || node3->op_type() != "Squeeze") continue; - if (node_reference[node2->output(0)] != 1) continue; + if (node_reference[node2->output(0)] != 1) continue; - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); - node2->set_input(0, node->input(0)); - node2->set_output(0, node3->output(0)); + node2->set_input(0, node->input(0)); + node2->set_output(0, node3->output(0)); - reduced_node_count += 2; - i += 2; + reduced_node_count += 2; + i += 2; + } } - } } -void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph, +void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); - // PReLU <= Unsqueeze - PReLU - if (node->op_type() == "Unsqueeze") { - // check weight - if (weights.find(node->input(0)) == weights.end()) continue; + // PReLU <= Unsqueeze - PReLU + if (node->op_type() == "Unsqueeze") + { + // check weight + if (weights.find(node->input(0)) == weights.end()) continue; - onnx::TensorProto& B = weights[node->input(0)]; - if (B.dims_size() != 1) continue; + onnx::TensorProto& B = weights[node->input(0)]; + if (B.dims_size() != 1) continue; - if (node_reference[node->output(0)] != 1) continue; + if (node_reference[node->output(0)] != 1) continue; - // axes = (1, 2) - std::vector axes = get_node_attr_ai(*node, "axes"); - if (axes.size() != 2) continue; - if (axes[0] != 1 || axes[1] != 2) continue; + // axes = (1, 2) + std::vector axes = get_node_attr_ai(*node, "axes"); + if (axes.size() != 2) continue; + if (axes[0] != 1 || axes[1] != 2) continue; - if (i + 1 >= 
node_count) continue; + if (i + 1 >= node_count) continue; - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - if (node2->op_type() != "PRelu") continue; + if (node2->op_type() != "PRelu") continue; - if (node2->input(1) != node->output(0)) continue; + if (node2->input(1) != node->output(0)) continue; - // reduce - node->set_op_type("noop_reducedncnn"); + // reduce + node->set_op_type("noop_reducedncnn"); - node_reference[node->output(0)] -= 1; + node_reference[node->output(0)] -= 1; - blob_names.erase(node->output(0)); + blob_names.erase(node->output(0)); - node2->set_input(1, node->input(0)); + node2->set_input(1, node->input(0)); - reduced_node_count += 1; - i += 1; + reduced_node_count += 1; + i += 1; + } } - } } -void fuse_normalize(onnx::GraphProto* mutable_graph, +void fuse_normalize(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // Normalize <= X - ReduceL2 - Clip - Expand - Div - // Normalize <= X - ReduceL2 - Clip - Shape - Expand - Div - if (node->op_type() == "ReduceL2") { - if (node_reference[node->output(0)] != 1) continue; - - // axes = (1) - std::vector axes = get_node_attr_ai(*node, "axes"); - if (axes.size() != 1) continue; - if (axes[0] != 1) continue; - - if (i + 3 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - - bool has_shape_node = node3->op_type() == "Shape"; - onnx::NodeProto* node_shape = 0; - if (has_shape_node) { - if (i + 4 >= node_count) continue; - - node_shape = node3; - node3 = mutable_graph->mutable_node(i + 3); - node4 = mutable_graph->mutable_node(i + 4); - } - - if (node2->op_type() != "Clip" || node3->op_type() != "Expand" || node4->op_type() != "Div") - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - if (node_reference[node3->output(0)] != 1) continue; - - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || - node4->input(0) != node->input(0) || node4->input(1) != node3->output(0)) - continue; - - if (has_shape_node) { - if (node_shape->input(0) != node->input(0) || node3->input(1) != node_shape->output(0)) - continue; - } - - // +eps - float clip_min; - if (node2->input_size() == 1) { - clip_min = get_node_attr_f(*node2, "min", -FLT_MAX); - } else { - const onnx::TensorProto& min_tp = weights[node2->input(1)]; - - clip_min = get_node_attr_from_input(min_tp); - } - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - if (has_shape_node) { - node_shape->set_op_type("noop_reducedncnn"); - } - node3->set_op_type("noop_reducedncnn"); - - node_reference[node->input(0)] -= has_shape_node ? 
2 : 1; - node_reference[node->output(0)] -= 1; - node_reference[node2->output(0)] -= 1; - if (has_shape_node) { - node_reference[node_shape->output(0)] -= 1; - } - node_reference[node3->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - if (has_shape_node) { - blob_names.erase(node_shape->output(0)); - } - blob_names.erase(node3->output(0)); - - node4->set_op_type("Normalize"); - node4->clear_input(); - node4->add_input(node->input(0)); - - onnx::AttributeProto* attr_alpha = node4->add_attribute(); - attr_alpha->set_name("eps"); - attr_alpha->set_f(clip_min); - - reduced_node_count += has_shape_node ? 4 : 3; - i += has_shape_node ? 4 : 3; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // Normalize <= X - ReduceL2 - Clip - Expand - Div + // Normalize <= X - ReduceL2 - Clip - Shape - Expand - Div + if (node->op_type() == "ReduceL2") + { + if (node_reference[node->output(0)] != 1) continue; + + // axes = (1) + std::vector axes = get_node_attr_ai(*node, "axes"); + if (axes.size() != 1) continue; + if (axes[0] != 1) continue; + + if (i + 3 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + + bool has_shape_node = node3->op_type() == "Shape"; + onnx::NodeProto* node_shape = 0; + if (has_shape_node) + { + if (i + 4 >= node_count) continue; + + node_shape = node3; + node3 = mutable_graph->mutable_node(i + 3); + node4 = mutable_graph->mutable_node(i + 4); + } + + if (node2->op_type() != "Clip" || node3->op_type() != "Expand" || node4->op_type() != "Div") + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + if (node_reference[node3->output(0)] != 1) continue; + + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || + node4->input(0) != node->input(0) || node4->input(1) != node3->output(0)) + continue; + + if (has_shape_node) + { + if (node_shape->input(0) != node->input(0) || node3->input(1) != node_shape->output(0)) + continue; + } + + // +eps + float clip_min; + if (node2->input_size() == 1) + { + clip_min = get_node_attr_f(*node2, "min", -FLT_MAX); + } + else + { + const onnx::TensorProto& min_tp = weights[node2->input(1)]; + + clip_min = get_node_attr_from_input(min_tp); + } + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + if (has_shape_node) + { + node_shape->set_op_type("noop_reducedncnn"); + } + node3->set_op_type("noop_reducedncnn"); + + node_reference[node->input(0)] -= has_shape_node ? 
2 : 1; + node_reference[node->output(0)] -= 1; + node_reference[node2->output(0)] -= 1; + if (has_shape_node) + { + node_reference[node_shape->output(0)] -= 1; + } + node_reference[node3->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + if (has_shape_node) + { + blob_names.erase(node_shape->output(0)); + } + blob_names.erase(node3->output(0)); + + node4->set_op_type("Normalize"); + node4->clear_input(); + node4->add_input(node->input(0)); + + onnx::AttributeProto* attr_alpha = node4->add_attribute(); + attr_alpha->set_name("eps"); + attr_alpha->set_f(clip_min); + + reduced_node_count += has_shape_node ? 4 : 3; + i += has_shape_node ? 4 : 3; + } } - } } -void fuse_groupnorm(onnx::GraphProto* mutable_graph, +void fuse_groupnorm(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // GroupNorm <= X - Reshape - InstanceNormalization - Reshape - Mul - Add - if (node->op_type() == "Reshape") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector shape; - if (node->input_size() == 1) { - shape = get_node_attr_ai(*node, "shape"); - } else { - // skip weight reshape - if (weights.find(node->input(1)) == weights.end()) continue; - - shape = get_node_attr_from_input_ai(weights[node->input(1)]); - } - - // 0, group, -1 - if (shape.size() != 3) continue; - - if (shape[0] != 0 || shape[2] != -1) continue; - - int groups = shape[1]; - - if (i + 4 >= node_count) continue; - - onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); - onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); - onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); - onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); - - if (node2->op_type() != "InstanceNormalization" || node3->op_type() != "Reshape" || - node4->op_type() != "Mul" || node5->op_type() != "Add") - continue; - - if (node_reference[node2->output(0)] != 1) continue; - - if (node_reference[node3->output(0)] != 1) continue; - - if (node_reference[node4->output(0)] != 1) continue; - - if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || - node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0)) - continue; - - // +eps - float eps = get_node_attr_f(*node2, "epsilon", 1e-05f); - - // InstanceNormalization S=1 B=0 - std::vector S = get_node_attr_from_input_af(weights[node2->input(1)]); - std::vector B = get_node_attr_from_input_af(weights[node2->input(2)]); - if ((int)S.size() != groups || (int)B.size() != groups) continue; - - bool instancenorm_affine = false; - for (int j = 0; j < groups; j++) { - if (S[j] != 1.f || B[j] != 0.f) { - instancenorm_affine = true; - break; + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count) +{ + int node_count = mutable_graph->node_size(); + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // GroupNorm <= X - Reshape - InstanceNormalization - Reshape - Mul - Add + if (node->op_type() == "Reshape") + { + if (node_reference[node->output(0)] != 1) continue; + + std::vector shape; + if (node->input_size() == 1) + { + shape = get_node_attr_ai(*node, "shape"); + } + else + { + // skip weight reshape + if 
(weights.find(node->input(1)) == weights.end()) continue; + + shape = get_node_attr_from_input_ai(weights[node->input(1)]); + } + + // 0, group, -1 + if (shape.size() != 3) continue; + + if (shape[0] != 0 || shape[2] != -1) continue; + + int groups = shape[1]; + + if (i + 4 >= node_count) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + + if (node2->op_type() != "InstanceNormalization" || node3->op_type() != "Reshape" || + node4->op_type() != "Mul" || node5->op_type() != "Add") + continue; + + if (node_reference[node2->output(0)] != 1) continue; + + if (node_reference[node3->output(0)] != 1) continue; + + if (node_reference[node4->output(0)] != 1) continue; + + if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) || + node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0)) + continue; + + // +eps + float eps = get_node_attr_f(*node2, "epsilon", 1e-05f); + + // InstanceNormalization S=1 B=0 + std::vector S = get_node_attr_from_input_af(weights[node2->input(1)]); + std::vector B = get_node_attr_from_input_af(weights[node2->input(2)]); + if ((int)S.size() != groups || (int)B.size() != groups) continue; + + bool instancenorm_affine = false; + for (int j = 0; j < groups; j++) + { + if (S[j] != 1.f || B[j] != 0.f) + { + instancenorm_affine = true; + break; + } + } + + if (instancenorm_affine) continue; + + std::vector shape2; + if (node3->input_size() == 1) + { + shape2 = get_node_attr_ai(*node3, "shape"); + } + else + { + // skip weight reshape + if (weights.find(node3->input(1)) == weights.end()) continue; + + shape2 = get_node_attr_from_input_ai(weights[node3->input(1)]); + } + + // 1, channels, w, h + if (shape2.size() != 4) continue; + + if (shape2[0] != 1) continue; + + int channels = shape2[1]; + + // affine + std::vector affine_S = get_node_attr_from_input_af(weights[node4->input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node5->input(1)]); + if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && + affine_B[0] == 0.f) + { + // no affine + } + else if ((int)affine_S.size() != channels && (int)affine_B.size() != channels) + { + // we only allow per-channel affine + continue; + } + + // reduce + node->set_op_type("noop_reducedncnn"); + node2->set_op_type("noop_reducedncnn"); + node3->set_op_type("noop_reducedncnn"); + node4->set_op_type("noop_reducedncnn"); + + if (node->input_size() == 2) + { + node_reference[node->input(1)] -= 1; + } + node_reference[node->output(0)] -= 1; + node_reference[node2->input(1)] -= 1; + node_reference[node2->input(2)] -= 1; + node_reference[node2->output(0)] -= 1; + if (node3->input_size() == 2) + { + node_reference[node3->input(1)] -= 1; + } + node_reference[node3->output(0)] -= 1; + node_reference[node4->output(0)] -= 1; + + blob_names.erase(node->output(0)); + blob_names.erase(node2->output(0)); + blob_names.erase(node3->output(0)); + blob_names.erase(node4->output(0)); + + std::string affine_scale = node4->input(1); + std::string affine_bias = node5->input(1); + + node5->set_op_type("GroupNorm"); + node5->clear_input(); + node5->add_input(node->input(0)); + node5->add_input(affine_scale); + node5->add_input(affine_bias); + + onnx::AttributeProto* attr_groups = node5->add_attribute(); + attr_groups->set_name("groups"); + attr_groups->set_i(groups); + + 
onnx::AttributeProto* attr_channels = node5->add_attribute(); + attr_channels->set_name("channels"); + attr_channels->set_i(channels); + + onnx::AttributeProto* attr_eps = node5->add_attribute(); + attr_eps->set_name("epsilon"); + attr_eps->set_f(eps); + + onnx::AttributeProto* attr_affine = node5->add_attribute(); + attr_affine->set_name("affine"); + attr_affine->set_i(1); + + reduced_node_count += 4; + i += 4; } - } - - if (instancenorm_affine) continue; - - std::vector shape2; - if (node3->input_size() == 1) { - shape2 = get_node_attr_ai(*node3, "shape"); - } else { - // skip weight reshape - if (weights.find(node3->input(1)) == weights.end()) continue; - - shape2 = get_node_attr_from_input_ai(weights[node3->input(1)]); - } - - // 1, channels, w, h - if (shape2.size() != 4) continue; - - if (shape2[0] != 1) continue; - - int channels = shape2[1]; - - // affine - std::vector affine_S = get_node_attr_from_input_af(weights[node4->input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node5->input(1)]); - if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && - affine_B[0] == 0.f) { - // no affine - } else if ((int)affine_S.size() != channels && (int)affine_B.size() != channels) { - // we only allow per-channel affine - continue; - } - - // reduce - node->set_op_type("noop_reducedncnn"); - node2->set_op_type("noop_reducedncnn"); - node3->set_op_type("noop_reducedncnn"); - node4->set_op_type("noop_reducedncnn"); - - if (node->input_size() == 2) { - node_reference[node->input(1)] -= 1; - } - node_reference[node->output(0)] -= 1; - node_reference[node2->input(1)] -= 1; - node_reference[node2->input(2)] -= 1; - node_reference[node2->output(0)] -= 1; - if (node3->input_size() == 2) { - node_reference[node3->input(1)] -= 1; - } - node_reference[node3->output(0)] -= 1; - node_reference[node4->output(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node4->output(0)); - - std::string affine_scale = node4->input(1); - std::string affine_bias = node5->input(1); - - node5->set_op_type("GroupNorm"); - node5->clear_input(); - node5->add_input(node->input(0)); - node5->add_input(affine_scale); - node5->add_input(affine_bias); - - onnx::AttributeProto* attr_groups = node5->add_attribute(); - attr_groups->set_name("groups"); - attr_groups->set_i(groups); - - onnx::AttributeProto* attr_channels = node5->add_attribute(); - attr_channels->set_name("channels"); - attr_channels->set_i(channels); - - onnx::AttributeProto* attr_eps = node5->add_attribute(); - attr_eps->set_name("epsilon"); - attr_eps->set_f(eps); - - onnx::AttributeProto* attr_affine = node5->add_attribute(); - attr_affine->set_name("affine"); - attr_affine->set_i(1); - - reduced_node_count += 4; - i += 4; } - } } -void fuse_layernorm(onnx::GraphProto* mutable_graph, +void fuse_layernorm(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count) { - int node_count = mutable_graph->node_size(); - for (int i = 0; i < node_count; i++) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - - // Mul - Add - if (node->op_type() == "ReduceMean") { - if (node_reference[node->output(0)] != 1) continue; - - std::vector axes = get_node_attr_ai(*node, "axes"); - - // -1 - // -2 -1 - if (axes.size() != 
                    std::map<std::string, int>& node_reference,
                    std::set<std::string>& blob_names,
                    int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div
        // LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div -
        //              Mul - Add
        if (node->op_type() == "ReduceMean")
        {
            if (node_reference[node->output(0)] != 1) continue;

            std::vector<int> axes = get_node_attr_ai(*node, "axes");

            // -1
            // -2 -1
            if (axes.size() != 1 && axes.size() != 2) continue;

            int normed_axes = (int)axes.size();
            if (normed_axes == 1 && axes[0] != -1) continue;
            if (normed_axes == 2 && (axes[0] != -2 || axes[1] != -1)) continue;

            if (i + 6 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
            onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3);
            onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4);
            onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5);
            onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6);

            if (node2->op_type() != "Sub" || node3->op_type() != "Pow" ||
                node4->op_type() != "ReduceMean" || node5->op_type() != "Add" ||
                node6->op_type() != "Sqrt" || node7->op_type() != "Div")
                continue;

            if (node_reference[node2->output(0)] != 2) continue;
            if (node_reference[node3->output(0)] != 1) continue;
            if (node_reference[node4->output(0)] != 1) continue;
            if (node_reference[node5->output(0)] != 1) continue;
            if (node_reference[node6->output(0)] != 1) continue;

            if (node2->input(0) != node->input(0) || node2->input(1) != node->output(0) ||
                node3->input(0) != node2->output(0) || node4->input(0) != node3->output(0) ||
                node5->input(0) != node4->output(0) || node6->input(0) != node5->output(0) ||
                node7->input(0) != node2->output(0) || node7->input(1) != node6->output(0))
                continue;

            if (weights.find(node3->input(1)) == weights.end()) continue;

            const onnx::TensorProto& pow_two = weights[node3->input(1)];
            if (pow_two.dims_size() != 0 || get_tensor_proto_data_size(pow_two) != 1) continue;

            float constant_pow_two = get_node_attr_from_input<float>(pow_two);
            if (constant_pow_two != 2.f) continue;

            std::vector<int> axes4 = get_node_attr_ai(*node4, "axes");

            // -1
            // -2 -1
            if ((int)axes4.size() != normed_axes) continue;

            if (normed_axes == 1 && axes4[0] != -1) continue;
            if (normed_axes == 2 && (axes4[0] != -2 || axes4[1] != -1)) continue;

            if (weights.find(node5->input(1)) == weights.end()) continue;

            const onnx::TensorProto& add_eps = weights[node5->input(1)];
            if (add_eps.dims_size() != 0 || get_tensor_proto_data_size(add_eps) != 1) continue;

            float eps = get_node_attr_from_input<float>(add_eps);

            int affine = 0;
            while (i + 8 < node_count)
            {
                onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7);
                onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8);

                if (node8->op_type() != "Mul" || node9->op_type() != "Add") break;

                if (node_reference[node7->output(0)] != 1) break;
                if (node_reference[node8->output(0)] != 1) break;

                if (node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0)) break;

                // affine
                std::vector<float> affine_S = get_node_attr_from_input_af(weights[node8->input(1)]);
                std::vector<float> affine_B = get_node_attr_from_input_af(weights[node9->input(1)]);
                if (affine_S.size() != affine_B.size()) break;

                affine = 1;
                break;
            }

            // reduce
            node->set_op_type("noop_reducedncnn");
            node2->set_op_type("noop_reducedncnn");
            node3->set_op_type("noop_reducedncnn");
            node4->set_op_type("noop_reducedncnn");
            node5->set_op_type("noop_reducedncnn");
            node6->set_op_type("noop_reducedncnn");

            node_reference[node->input(0)] -= 1;
            node_reference[node2->input(0)] -= 1;
            node_reference[node2->input(1)] -= 1;
            node_reference[node3->input(0)] -= 1;
            node_reference[node3->input(1)] -= 1;
            node_reference[node4->input(0)] -= 1;
            node_reference[node5->input(0)] -= 1;
            node_reference[node5->input(1)] -= 1;
            node_reference[node6->input(0)] -= 1;
            node_reference[node7->input(0)] -= 1;
            node_reference[node7->input(1)] -= 1;

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));
            blob_names.erase(node3->output(0));
            blob_names.erase(node4->output(0));
            blob_names.erase(node5->output(0));
            blob_names.erase(node6->output(0));

            node_reference[node->input(0)] += 1;

            if (affine == 0)
            {
                node7->set_op_type("LayerNorm");
                node7->clear_input();
                node7->add_input(node->input(0));

                onnx::AttributeProto* attr_eps = node7->add_attribute();
                attr_eps->set_name("epsilon");
                attr_eps->set_f(eps);

                onnx::AttributeProto* attr_affine = node7->add_attribute();
                attr_affine->set_name("affine");
                attr_affine->set_i(affine);

                reduced_node_count += 6;
                i += 6;
            }
            else // if (affine == 1)
            {
                onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7);
                onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8);

                node7->set_op_type("noop_reducedncnn");
                node8->set_op_type("noop_reducedncnn");

                node_reference[node8->input(0)] -= 1;
                node_reference[node9->input(0)] -= 1;

                blob_names.erase(node7->output(0));
                blob_names.erase(node8->output(0));

                std::string affine_scale = node8->input(1);
                std::string affine_bias = node9->input(1);

                node9->set_op_type("LayerNorm");
                node9->clear_input();
                node9->add_input(node->input(0));
                node9->add_input(affine_scale);
                node9->add_input(affine_bias);

                onnx::AttributeProto* attr_eps = node9->add_attribute();
                attr_eps->set_name("epsilon");
                attr_eps->set_f(eps);

                onnx::AttributeProto* attr_affine = node9->add_attribute();
                attr_affine->set_name("affine");
                attr_affine->set_i(affine);

                reduced_node_count += 8;
                i += 8;
            }
        }
    }
}
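The subgraph matched above is textbook layer normalization: the first ReduceMean computes the mean, the Sub - Pow(2) - ReduceMean chain computes the variance, Add(eps) - Sqrt - Div normalizes, and the optional Mul - Add tail is the affine transform. A minimal scalar sketch of what the fused LayerNorm node must reproduce, assuming normalization over the last axis only (function name and layout are illustrative, not ncnn API):

    #include <cmath>
    #include <vector>

    // y = (x - mean) / sqrt(var + eps) [* gamma + beta]
    std::vector<float> layernorm_ref(const std::vector<float>& x, float eps,
                                     const float* gamma = nullptr, const float* beta = nullptr)
    {
        float mean = 0.f;
        for (float v : x) mean += v;
        mean /= x.size(); // first ReduceMean

        float var = 0.f;
        for (float v : x) var += (v - mean) * (v - mean);
        var /= x.size(); // Sub - Pow(2) - second ReduceMean

        std::vector<float> y(x.size());
        for (size_t j = 0; j < x.size(); j++)
        {
            float v = (x[j] - mean) / std::sqrt(var + eps); // Add(eps) - Sqrt - Div
            if (gamma && beta) v = v * gamma[j] + beta[j];  // optional affine Mul - Add
            y[j] = v;
        }
        return y;
    }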
void fuse_flatten(onnx::GraphProto* mutable_graph,
                  std::map<std::string, onnx::TensorProto>& weights,
                  std::map<std::string, int>& node_reference,
                  std::set<std::string>& blob_names,
                  int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // Flatten <= X - Shape - Gather - Constant - Unsqueeze - Unsqueeze - Concat
        //            - Reshape
        if (node->op_type() == "Shape")
        {
            if (node_reference[node->output(0)] != 1) continue;

            if (i + 6 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
            onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3);
            onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4);
            onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5);
            onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6);

            if (node2->op_type() != "Gather" || node3->op_type() != "Constant" ||
                node4->op_type() != "Unsqueeze" || node5->op_type() != "Unsqueeze" ||
                node6->op_type() != "Concat" || node7->op_type() != "Reshape")
                continue;

            if (node_reference[node2->output(0)] != 1) continue;

            // if (node_reference[node3->output(0)] != 1)
            //     continue;

            if (node_reference[node4->output(0)] != 1) continue;
            if (node_reference[node5->output(0)] != 1) continue;
            if (node_reference[node6->output(0)] != 1) continue;

            if (node2->input(0) != node->output(0) || node4->input(0) != node2->output(0) ||
                node5->input(0) != node3->output(0) || node6->input(0) != node4->output(0) ||
                node6->input(1) != node5->output(0) || node7->input(0) != node->input(0) ||
                node7->input(1) != node6->output(0))
                continue;

            // axis = 0
            int gather_axis = get_node_attr_i(*node2, "axis");
            if (gather_axis != 0) continue;

            // indices = 0
            if (weights.find(node2->input(1)) == weights.end()) continue;

            std::vector<int> gather_indices = get_node_attr_from_input_ai(weights[node2->input(1)]);
            if (gather_indices.size() != 1 || gather_indices[0] != 0) continue;

            // axes = (0)
            std::vector<int> unsqueeze_axes = get_node_attr_ai(*node4, "axes");
            if (unsqueeze_axes.size() != 1) continue;
            if (unsqueeze_axes[0] != 0) continue;

            // axes = (0)
            std::vector<int> unsqueeze2_axes = get_node_attr_ai(*node5, "axes");
            if (unsqueeze2_axes.size() != 1) continue;
            if (unsqueeze2_axes[0] != 0) continue;

            // data = -1
            if (weights.find(node5->input(0)) == weights.end()) continue;

            std::vector<int> unsqueeze2_data = get_node_attr_from_input_ai(weights[node5->input(0)]);
            if (unsqueeze2_data.size() != 1 || unsqueeze2_data[0] != -1) continue;

            // axis = 0
            int concat_axis = get_node_attr_i(*node6, "axis");
            if (concat_axis != 0) continue;

            // reduce
            node->set_op_type("noop_reducedncnn");
            node2->set_op_type("noop_reducedncnn");
            // node3->set_op_type("noop_reducedncnn");
            node4->set_op_type("noop_reducedncnn");
            node5->set_op_type("noop_reducedncnn");
            node6->set_op_type("noop_reducedncnn");

            node_reference[node->input(0)] -= 1;
            node_reference[node->output(0)] -= 1;
            node_reference[node2->input(1)] -= 1;
            node_reference[node2->output(0)] -= 1;
            // node_reference[node3->output(0)] -= 1;
            node_reference[node4->output(0)] -= 1;
            node_reference[node5->input(0)] -= 1;
            node_reference[node5->output(0)] -= 1;
            node_reference[node6->output(0)] -= 1;

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));
            // blob_names.erase(node3->output(0));
            blob_names.erase(node4->output(0));
            blob_names.erase(node5->output(0));
            blob_names.erase(node6->output(0));

            node7->set_op_type("Flatten");
            node7->clear_input();
            node7->add_input(node->input(0));

            reduced_node_count += 5;
            i += 5;
        }
    }
}
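The seven matched nodes exist only to build the reshape target {N, -1} at runtime; once the constants are verified, the whole chain collapses into a single Flatten. A sketch of the shape arithmetic being folded away (values are hypothetical):

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<long> input_shape = {2, 3, 16, 16};  // hypothetical blob shape
        long batch = input_shape[0];                     // Shape - Gather(axis=0, indices=0)
        std::vector<long> target = {batch, -1};          // Unsqueeze x2 - Concat(axis=0)
        // Reshape(X, target) is exactly Flatten(X)
        printf("reshape target: (%ld, %ld)\n", target[0], target[1]);
        return 0;
    }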
void fuse_pixelshuffle(onnx::GraphProto* mutable_graph,
                       std::map<std::string, onnx::TensorProto>& weights,
                       std::map<std::string, int>& node_reference,
                       std::set<std::string>& blob_names,
                       int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // PixelShuffle <= Reshape - Transpose - Reshape
        // PixelShuffle <= Reshape - Transpose - Constant - Reshape
        if (node->op_type() == "Reshape")
        {
            if (node_reference[node->output(0)] != 1) continue;

            std::vector<int> shape;
            if (node->input_size() == 1)
            {
                shape = get_node_attr_ai(*node, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node->input(1)) == weights.end()) continue;

                shape = get_node_attr_from_input_ai(weights[node->input(1)]);
            }

            // -1, 3, upscale_factor, upscale_factor, height, width
            if (shape.size() != 6) continue;

            if (shape[0] != 1 && shape[0] != -1) continue;
            if (shape[2] != shape[3]) continue;

            if (i + 2 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);

            if (node3->op_type() == "Constant")
            {
                if (i + 3 >= node_count) continue;

                node3 = mutable_graph->mutable_node(i + 3);
            }

            if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue;

            if (node_reference[node2->output(0)] != 1) continue;

            // 0 1 4 2 5 3
            std::vector<int> perm = get_node_attr_ai(*node2, "perm");
            if (perm.size() != 6) continue;

            if (perm[0] != 0 || perm[1] != 1 || perm[2] != 4 || perm[3] != 2 || perm[4] != 5 ||
                perm[5] != 3)
                continue;

            std::vector<int> shape3;
            if (node3->input_size() == 1)
            {
                shape3 = get_node_attr_ai(*node3, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node3->input(1)) == weights.end()) continue;

                shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]);
            }

            // -1, 3, height, width
            if (shape3.size() != 4) continue;

            if (shape3[0] != 1 && shape3[0] != -1) continue;

            if (shape3[1] != shape[1] || shape3[2] != shape[2] * shape[4] ||
                shape3[3] != shape[3] * shape[5])
                continue;

            // reduce
            node->set_op_type("noop_reducedncnn");
            node2->set_op_type("noop_reducedncnn");

            if (node->input_size() == 2)
            {
                node_reference[node->input(1)] -= 1;
            }
            node_reference[node->output(0)] -= 1;
            node_reference[node2->output(0)] -= 1;
            if (node3->input_size() == 2)
            {
                node_reference[node3->input(1)] -= 1;
            }

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));

            node3->set_op_type("PixelShuffle");
            node3->set_input(0, node->input(0));

            onnx::AttributeProto* attr_group = node3->add_attribute();
            attr_group->set_name("scale_factor");
            attr_group->set_i(shape[2]);

            reduced_node_count += 2;
            i += 2;
        }
    }
}
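The Reshape - Transpose(0 1 4 2 5 3) - Reshape chain is the standard depth-to-space trick. A reference loop for the semantics the fused node must match, assuming a single batch and the channel ordering implied by the matched shapes (illustrative, not converter code):

    #include <vector>

    // PixelShuffle with upscale factor r: (C*r*r, H, W) -> (C, H*r, W*r)
    void pixel_shuffle_ref(const std::vector<float>& in, std::vector<float>& out,
                           int C, int H, int W, int r)
    {
        out.resize((size_t)C * H * r * W * r);
        for (int c = 0; c < C; c++)
            for (int y = 0; y < H * r; y++)
                for (int x = 0; x < W * r; x++)
                {
                    // input channel layout matches the matched Reshape:
                    // (C, r, r, H, W), then Transpose perm 0 1 4 2 5 3
                    int ic = c * r * r + (y % r) * r + (x % r);
                    out[((size_t)c * H * r + y) * W * r + x] =
                        in[((size_t)ic * H + y / r) * W + x / r];
                }
    }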
void fuse_reorg(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights, std::map<std::string, int>& node_reference, std::set<std::string>& blob_names, int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // Reorg <= Reshape - Transpose - Reshape
        // Reorg <= Reshape - Transpose - Constant - Reshape
        if (node->op_type() == "Reshape")
        {
            if (node_reference[node->output(0)] != 1) continue;

            std::vector<int> shape;
            if (node->input_size() == 1)
            {
                shape = get_node_attr_ai(*node, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node->input(1)) == weights.end()) continue;

                shape = get_node_attr_from_input_ai(weights[node->input(1)]);
            }

            // -1, 3, out_height, block_size, out_width, block_size
            if (shape.size() != 6) continue;

            if (shape[0] != 1 && shape[0] != -1) continue;
            if (shape[3] != shape[5]) continue;

            if (i + 2 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);

            if (node3->op_type() == "Constant")
            {
                if (i + 3 >= node_count) continue;

                node3 = mutable_graph->mutable_node(i + 3);
            }

            if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue;

            if (node_reference[node2->output(0)] != 1) continue;

            // 0 1 3 5 2 4
            std::vector<int> perm = get_node_attr_ai(*node2, "perm");
            if (perm.size() != 6) continue;

            if (perm[0] != 0 || perm[1] != 1 || perm[2] != 3 || perm[3] != 5 || perm[4] != 2 ||
                perm[5] != 4)
                continue;

            std::vector<int> shape3;
            if (node3->input_size() == 1)
            {
                shape3 = get_node_attr_ai(*node3, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node3->input(1)) == weights.end()) continue;

                shape3 = get_node_attr_from_input_ai(weights[node3->input(1)]);
            }

            // -1, out_channels, out_height, out_width
            if (shape3.size() != 4) continue;

            if (shape3[0] != 1 && shape3[0] != -1) continue;

            if (shape3[1] != shape[1] * shape[3] * shape[5] || shape3[2] != shape[2] ||
                shape3[3] != shape[4])
                continue;

            // reduce
            node->set_op_type("noop_reducedncnn");
            node2->set_op_type("noop_reducedncnn");

            if (node->input_size() == 2)
            {
                node_reference[node->input(1)] -= 1;
            }
            node_reference[node->output(0)] -= 1;
            node_reference[node2->output(0)] -= 1;
            if (node3->input_size() == 2)
            {
                node_reference[node3->input(1)] -= 1;
            }

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));

            node3->set_op_type("Reorg");
            node3->set_input(0, node->input(0));

            onnx::AttributeProto* attr_group = node3->add_attribute();
            attr_group->set_name("stride");
            attr_group->set_i(shape[3]);

            reduced_node_count += 2;
            i += 2;
        }
    }
}
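Reorg is the inverse direction: space-to-depth with stride s, matched here via the Transpose perm 0 1 3 5 2 4. A reference loop for the fused semantics, under the same single-batch assumption as the PixelShuffle sketch above (illustrative):

    #include <vector>

    // Reorg with stride s: (C, H, W) -> (C*s*s, H/s, W/s)
    void reorg_ref(const std::vector<float>& in, std::vector<float>& out,
                   int C, int H, int W, int s)
    {
        out.resize((size_t)C * H * W);
        int outH = H / s, outW = W / s;
        for (int c = 0; c < C; c++)
            for (int sy = 0; sy < s; sy++)
                for (int sx = 0; sx < s; sx++)
                    for (int h = 0; h < outH; h++)
                        for (int w = 0; w < outW; w++)
                        {
                            int oc = (c * s + sy) * s + sx; // Transpose perm 0 1 3 5 2 4
                            out[((size_t)oc * outH + h) * outW + w] =
                                in[((size_t)c * H + h * s + sy) * W + w * s + sx];
                        }
    }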
void fuse_expand_broadcast(onnx::GraphProto* mutable_graph,
                           std::map<std::string, onnx::TensorProto>& weights,
                           std::map<std::string, int>& node_reference,
                           std::set<std::string>& blob_names,
                           int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // Add/Sub/Mul/Div/Min/Max <= Expand - Add/Sub/Mul/Div/Min/Max
        if (node->op_type() == "Expand")
        {
            if (node_reference[node->output(0)] != 1) continue;

            if (i + 1 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);

            if (node2->op_type() != "Add" && node2->op_type() != "Sub" && node2->op_type() != "Mul" &&
                node2->op_type() != "Div" && node2->op_type() != "Min" && node2->op_type() != "Max")
                continue;

            if (node2->input(1) != node->output(0) && node2->input(0) != node->output(0)) continue;

            // reduce
            node->set_op_type("noop_reducedncnn");

            node_reference[node->output(0)] -= 1;
            if (node->input_size() == 2)
            {
                node_reference[node->input(1)] -= 1;
            }

            blob_names.erase(node->output(0));

            if (node2->input(0) == node->output(0))
            {
                node2->set_input(0, node->input(0));
            }
            else
            {
                node2->set_input(1, node->input(0));
            }

            reduced_node_count += 1;
            i += 1;
        }
    }
}
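Dropping the Expand is safe because ONNX binary element-wise ops already apply numpy-style broadcasting, so feeding the un-expanded input straight into the following Add/Sub/Mul/Div/Min/Max produces the same result. A tiny illustration with hypothetical shapes:

    #include <cstdio>

    int main()
    {
        // Expand(x, {2,3}) + y  ==  x + y  when x is {1,3} and y is {2,3}
        float x[1][3] = {{1, 2, 3}};
        float y[2][3] = {{10, 20, 30}, {40, 50, 60}};
        float z[2][3];
        for (int i = 0; i < 2; i++)
            for (int j = 0; j < 3; j++)
                z[i][j] = x[0][j] + y[i][j]; // the row of x is reused: implicit Expand
        printf("%g %g\n", z[0][0], z[1][2]);
        return 0;
    }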
void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph,
                       std::map<std::string, onnx::TensorProto>& weights,
                       std::map<std::string, int>& node_reference,
                       std::set<std::string>& blob_names,
                       int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // LSTM(bi) <= LSTM(bi) - Transpose - Reshape - Transpose
        // or LSTM(bi) <= LSTM(bi) - Transpose - Constant - Reshape - Transpose
        if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN")
        {
            if (node_reference[node->output(0)] != 1) continue;

            if (i + 2 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);

            // skip if second op is a constant
            if (node3->op_type() == "Constant")
            {
                if (i + 3 >= node_count) continue;
                node3 = mutable_graph->mutable_node(i + 3);
                i += 1;
            }

            if (node2->op_type() != "Transpose" || node3->op_type() != "Reshape") continue;

            if (node_reference[node2->output(0)] != 1) continue;

            if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0)) continue;

            std::string direction = get_node_attr_s(*node, "direction");
            if (direction != "bidirectional") continue;

            // 0 2 1 3
            std::vector<int> perm = get_node_attr_ai(*node2, "perm");
            if (perm.size() != 4) continue;

            if (perm[0] != 0 || perm[1] != 2 || perm[2] != 1 || perm[3] != 3) continue;

            std::vector<int> shape;
            if (node3->input_size() == 1)
            {
                shape = get_node_attr_ai(*node3, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node3->input(1)) == weights.end()) continue;

                shape = get_node_attr_from_input_ai(weights[node3->input(1)]);
            }

            // 0 0 -1
            if (shape.size() != 3) continue;

            if (shape[0] != 0 || shape[1] != 0 || shape[2] != -1) continue;

            // reduce
            node2->set_op_type("noop_reducedncnn");
            node3->set_op_type("noop_reducedncnn");

            node_reference[node->output(0)] -= 1;
            node_reference[node2->output(0)] -= 1;
            if (node3->input_size() == 2)
            {
                node_reference[node3->input(1)] -= 1;
            }

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));

            node->set_output(0, node3->output(0));

            reduced_node_count += 2;
            i += 2;

            if (i + 1 < node_count)
            {
                if (node_reference[node3->output(0)] != 1) continue;

                onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 1);

                if (node4->op_type() != "Transpose") continue;

                if (node4->input(0) != node->output(0)) continue;

                // 1 0 2
                std::vector<int> perm4 = get_node_attr_ai(*node4, "perm");
                if (perm4.size() != 3) continue;

                if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue;

                // reduce
                node4->set_op_type("noop_reducedncnn");

                node_reference[node->output(0)] -= 1;

                blob_names.erase(node->output(0));

                node->set_output(0, node4->output(0));

                reduced_node_count += 1;
                i += 1;
            }
        }
    }

    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // LSTM(uni) <= LSTM(uni) - Squeeze - Transpose
        if (node->op_type() == "LSTM" || node->op_type() == "GRU" || node->op_type() == "RNN")
        {
            if (node_reference[node->output(0)] != 1) continue;

            if (i + 1 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);

            if (node2->op_type() != "Squeeze") continue;

            if (node2->input(0) != node->output(0)) continue;

            std::string direction = get_node_attr_s(*node, "direction");
            if (direction == "bidirectional") continue;

            // 1
            std::vector<int> axes = get_node_attr_ai(*node2, "axes");
            if (axes.size() != 1) continue;

            if (axes[0] != 1) continue;

            // reduce
            node2->set_op_type("noop_reducedncnn");

            node_reference[node->output(0)] -= 1;

            blob_names.erase(node->output(0));

            node->set_output(0, node2->output(0));

            reduced_node_count += 1;
            i += 1;

            if (i + 1 < node_count)
            {
                if (node_reference[node2->output(0)] != 1) continue;

                onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 1);

                if (node3->op_type() != "Transpose") continue;

                if (node3->input(0) != node->output(0)) continue;

                // 1 0 2
                std::vector<int> perm4 = get_node_attr_ai(*node3, "perm");
                if (perm4.size() != 3) continue;

                if (perm4[0] != 1 || perm4[1] != 0 || perm4[2] != 2) continue;

                // reduce
                node3->set_op_type("noop_reducedncnn");

                node_reference[node->output(0)] -= 1;

                blob_names.erase(node->output(0));

                node->set_output(0, node3->output(0));

                reduced_node_count += 1;
                i += 1;
            }
        }
    }

    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // LSTM <= Transpose - LSTM
        if (node->op_type() == "Transpose")
        {
            if (node_reference[node->output(0)] != 1) continue;

            // 1 0 2
            std::vector<int> perm = get_node_attr_ai(*node, "perm");
            if (perm.size() != 3) continue;

            if (perm[0] != 1 || perm[1] != 0 || perm[2] != 2) continue;

            if (i + 1 >= node_count) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);

            if (node2->op_type() != "LSTM" && node2->op_type() != "GRU" && node2->op_type() != "RNN")
                continue;

            if (node2->input(0) != node->output(0)) continue;

            // reduce
            node->set_op_type("noop_reducedncnn");

            node_reference[node->output(0)] -= 1;

            blob_names.erase(node->output(0));

            node2->set_input(0, node->input(0));

            reduced_node_count += 1;
            i += 1;
        }
    }
}
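All three passes above absorb layout shuffles around recurrent layers. ONNX emits the recurrent output Y as [seq, num_directions, batch, hidden]; the erased Transpose - Reshape (bidirectional) or Squeeze (unidirectional) nodes convert that to the [seq, batch, dir * hidden] layout the fused layer then produces directly. A sketch of the shape bookkeeping (sizes are hypothetical):

    #include <array>
    #include <cstdio>

    int main()
    {
        std::array<int, 4> Y = {/*seq*/ 8, /*dir*/ 2, /*batch*/ 1, /*hidden*/ 64};

        // bidirectional: Transpose perm 0 2 1 3, then Reshape {0, 0, -1}
        std::array<int, 3> fused = {Y[0], Y[2], Y[1] * Y[3]}; // {8, 1, 128}

        // unidirectional: Squeeze(axes={1}) just removes the dir=1 axis instead
        printf("%d %d %d\n", fused[0], fused[1], fused[2]);
        return 0;
    }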
void fuse_multiheadattention(onnx::GraphProto* mutable_graph,
                             std::map<std::string, onnx::TensorProto>& weights,
                             std::map<std::string, int>& node_reference,
                             std::set<std::string>& blob_names,
                             int& reduced_node_count)
{
    int node_count = mutable_graph->node_size();
    for (int i = 0; i < node_count; i++)
    {
        onnx::NodeProto* node = mutable_graph->mutable_node(i);

        // MultiHeadAttention <= MatMul(q) - Add
        //                       - MatMul(k) - Add
        //                       - MatMul(v) - Add
        //                       - Mul
        //                       - Reshape - Transpose
        //                       - Reshape - Reshape - Transpose - Transpose
        //                       - Gemm - Softmax - Gemm - Transpose - Reshape -
        //                       MatMul - Add
        if (node->op_type() == "MatMul")
        {
            if (i + 19 >= node_count) continue;

            if (node_reference[node->output(0)] != 1) continue;

            onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1);
            onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2);
            onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3);
            onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4);
            onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5);
            onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6);
            onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7);
            onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8);
            onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9);
            onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10);
            onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11);
            onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12);
            onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13);
            onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14);
            onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15);
            onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16);
            onnx::NodeProto* node18 = mutable_graph->mutable_node(i + 17);
            onnx::NodeProto* node19 = mutable_graph->mutable_node(i + 18);
            onnx::NodeProto* node20 = mutable_graph->mutable_node(i + 19);

            if (node2->op_type() != "Add" || node3->op_type() != "MatMul" || node4->op_type() != "Add" ||
                node5->op_type() != "MatMul" || node6->op_type() != "Add" || node7->op_type() != "Mul" ||
                node8->op_type() != "Reshape" || node9->op_type() != "Transpose" ||
                node10->op_type() != "Reshape" || node11->op_type() != "Reshape" ||
                node12->op_type() != "Transpose" || node13->op_type() != "Transpose" ||
                node14->op_type() != "MatMul" || node15->op_type() != "Softmax" ||
                node16->op_type() != "MatMul" || node17->op_type() != "Transpose" ||
                node18->op_type() != "Reshape" || node19->op_type() != "MatMul" ||
                node20->op_type() != "Add")
                continue;

            if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 ||
                node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 ||
                node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 ||
                node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 ||
                node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 ||
                node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 ||
                node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 ||
                node_reference[node16->output(0)] != 1 || node_reference[node17->output(0)] != 1 ||
                node_reference[node18->output(0)] != 1 || node_reference[node19->output(0)] != 1)
                continue;

            if (node2->input(0) != node->output(0) || node4->input(0) != node3->output(0) ||
                node6->input(0) != node5->output(0) || node7->input(0) != node2->output(0) ||
                node8->input(0) != node7->output(0) || node9->input(0) != node8->output(0) ||
                node10->input(0) != node4->output(0) || node11->input(0) != node6->output(0) ||
                node12->input(0) != node11->output(0) || node13->input(0) != node10->output(0) ||
                node14->input(0) != node9->output(0) || node14->input(1) != node13->output(0) ||
                node15->input(0) != node14->output(0) || node16->input(0) != node15->output(0) ||
                node16->input(1) != node12->output(0) || node17->input(0) != node16->output(0) ||
                node18->input(0) != node17->output(0) || node19->input(0) != node18->output(0) ||
                node20->input(0) != node19->output(0))
                continue;

            std::vector<float> q_B = get_node_attr_from_input_af(weights[node2->input(1)]);
            std::vector<float> k_B = get_node_attr_from_input_af(weights[node4->input(1)]);
            std::vector<float> v_B = get_node_attr_from_input_af(weights[node6->input(1)]);
            std::vector<float> o_B = get_node_attr_from_input_af(weights[node20->input(1)]);

            if (q_B.size() != k_B.size() || q_B.size() != v_B.size() || q_B.size() != o_B.size())
                continue;

            int embed_dim = q_B.size();

            // 1 0 2
            std::vector<int> perm9 = get_node_attr_ai(*node9, "perm");
            std::vector<int> perm12 = get_node_attr_ai(*node12, "perm");
            if (perm9.size() != 3 || perm12.size() != 3) continue;

            if (perm9[0] != 1 || perm9[1] != 0 || perm9[2] != 2 || perm12[0] != 1 || perm12[1] != 0 ||
                perm12[2] != 2)
                continue;

            // 1 2 0
            std::vector<int> perm13 = get_node_attr_ai(*node13, "perm");
            if (perm13.size() != 3) continue;

            if (perm13[0] != 1 || perm13[1] != 2 || perm13[2] != 0) continue;

            // 1 0 2
            std::vector<int> perm17 = get_node_attr_ai(*node17, "perm");
            if (perm17.size() != 3) continue;

            if (perm17[0] != 1 || perm17[1] != 0 || perm17[2] != 2) continue;

            int softmax_axis = get_node_attr_i(*node15, "axis");
            if (softmax_axis != 2) continue;

            // 1/-1, seqlen * num_heads, embed_dim / num_heads
            std::vector<int> shape8;
            std::vector<int> shape10;
            std::vector<int> shape11;
            if (node8->input_size() == 1)
            {
                shape8 = get_node_attr_ai(*node8, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node8->input(1)) == weights.end()) continue;

                shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]);
            }
            if (node10->input_size() == 1)
            {
                shape10 = get_node_attr_ai(*node10, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node10->input(1)) == weights.end()) continue;

                shape10 = get_node_attr_from_input_ai(weights[node10->input(1)]);
            }
            if (node11->input_size() == 1)
            {
                shape11 = get_node_attr_ai(*node11, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node11->input(1)) == weights.end()) continue;

                shape11 = get_node_attr_from_input_ai(weights[node11->input(1)]);
            }

            if (shape8.size() != 3 || shape10.size() != 3 || shape11.size() != 3) continue;

            if (shape8[1] != shape10[1] || shape8[1] != shape11[1] || shape8[2] != shape10[2] ||
                shape8[2] != shape11[2])
                continue;

            int num_heads = embed_dim / shape8[2];

            // 1, seqlen, embed_dim
            std::vector<int> shape18;
            if (node18->input_size() == 1)
            {
                shape18 = get_node_attr_ai(*node18, "shape");
            }
            else
            {
                // skip weight reshape
                if (weights.find(node18->input(1)) == weights.end()) continue;

                shape18 = get_node_attr_from_input_ai(weights[node18->input(1)]);
            }

            if (shape18.size() != 3) continue;

            if (shape18[2] != embed_dim || shape18[1] * num_heads != shape8[1]) continue;

            // reduce
            node->set_op_type("noop_reducedncnn");
            node2->set_op_type("noop_reducedncnn");
            node3->set_op_type("noop_reducedncnn");
            node4->set_op_type("noop_reducedncnn");
            node5->set_op_type("noop_reducedncnn");
            node6->set_op_type("noop_reducedncnn");
            node7->set_op_type("noop_reducedncnn");
            node8->set_op_type("noop_reducedncnn");
            node9->set_op_type("noop_reducedncnn");
            node10->set_op_type("noop_reducedncnn");
            node11->set_op_type("noop_reducedncnn");
            node12->set_op_type("noop_reducedncnn");
            node13->set_op_type("noop_reducedncnn");
            node14->set_op_type("noop_reducedncnn");
            node15->set_op_type("noop_reducedncnn");
            node16->set_op_type("noop_reducedncnn");
            node17->set_op_type("noop_reducedncnn");
            node18->set_op_type("noop_reducedncnn");
            node19->set_op_type("noop_reducedncnn");

            node_reference[node2->input(0)] -= 1;
            node_reference[node4->input(0)] -= 1;
            node_reference[node6->input(0)] -= 1;
            node_reference[node7->input(0)] -= 1;
            node_reference[node7->input(1)] -= 1;
            node_reference[node8->input(0)] -= 1;
            if (node8->input_size() == 2)
            {
                node_reference[node8->input(1)] -= 1;
            }
            node_reference[node9->input(0)] -= 1;
            node_reference[node10->input(0)] -= 1;
            if (node10->input_size() == 2)
            {
                node_reference[node10->input(1)] -= 1;
            }
            node_reference[node11->input(0)] -= 1;
            if (node11->input_size() == 2)
            {
                node_reference[node11->input(1)] -= 1;
            }
            node_reference[node12->input(0)] -= 1;
            node_reference[node13->input(0)] -= 1;
            node_reference[node14->input(0)] -= 1;
            node_reference[node14->input(1)] -= 1;
            node_reference[node15->input(0)] -= 1;
            node_reference[node16->input(0)] -= 1;
            node_reference[node16->input(1)] -= 1;
            node_reference[node17->input(0)] -= 1;
            node_reference[node18->input(0)] -= 1;
            if (node18->input_size() == 2)
            {
                node_reference[node18->input(1)] -= 1;
            }
            node_reference[node19->input(0)] -= 1;
            node_reference[node20->input(0)] -= 1;

            blob_names.erase(node->output(0));
            blob_names.erase(node2->output(0));
            blob_names.erase(node3->output(0));
            blob_names.erase(node4->output(0));
            blob_names.erase(node5->output(0));
            blob_names.erase(node6->output(0));
            blob_names.erase(node7->output(0));
            blob_names.erase(node8->output(0));
            blob_names.erase(node9->output(0));
            blob_names.erase(node10->output(0));
            blob_names.erase(node11->output(0));
            blob_names.erase(node12->output(0));
            blob_names.erase(node13->output(0));
            blob_names.erase(node14->output(0));
            blob_names.erase(node15->output(0));
            blob_names.erase(node16->output(0));
            blob_names.erase(node17->output(0));
            blob_names.erase(node18->output(0));
            blob_names.erase(node19->output(0));

            std::string qw = node->input(1);
            std::string qb = node2->input(1);
            std::string kw = node3->input(1);
            std::string kb = node4->input(1);
            std::string vw = node5->input(1);
            std::string vb = node6->input(1);
            std::string ow = node19->input(1);
            std::string ob = node20->input(1);

            node20->set_op_type("MultiHeadAttention");
            node20->clear_input();
            node20->add_input(node->input(0));
            node20->add_input(node3->input(0));
            node20->add_input(node5->input(0));
            // q
            node20->add_input(qw);
            node20->add_input(qb);
            // k
            node20->add_input(kw);
            node20->add_input(kb);
            // v
            node20->add_input(vw);
            node20->add_input(vb);
            // out linear
            node20->add_input(ow);
            node20->add_input(ob);

            onnx::AttributeProto* attr_embed_dim = node20->add_attribute();
            attr_embed_dim->set_name("embed_dim");
            attr_embed_dim->set_i(embed_dim);

            onnx::AttributeProto* attr_num_heads = node20->add_attribute();
            attr_num_heads->set_name("num_heads");
            attr_num_heads->set_i(num_heads);

            reduced_node_count += 19;
            i += 19;
        }
    }
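    // Second variant (summary): some exporters pack the q/k/v projections into a
    // single MatMul - Add followed by a three-way Split, so q, k and v share one
    // weight and the packed bias must satisfy qkv_B.size() == 3 * o_B.size().
    // Apart from that, the matching below mirrors the loop above, with num_heads
    // again derived from the Reshape target as embed_dim / per-head size.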
node16->input(0) != node15->output(0) || node17->input(0) != node16->output(0))
-        continue;
-
-      std::vector<float> qkv_B = get_node_attr_from_input_af(weights[node2->input(1)]);
-      std::vector<float> o_B = get_node_attr_from_input_af(weights[node17->input(1)]);
-
-      if (qkv_B.size() != o_B.size() * 3) continue;
-
-      int embed_dim = o_B.size();
-
-      // 1 0 2
-      std::vector<int> perm6 = get_node_attr_ai(*node6, "perm");
-      std::vector<int> perm9 = get_node_attr_ai(*node9, "perm");
-      if (perm6.size() != 3 || perm9.size() != 3) continue;
-
-      if (perm6[0] != 1 || perm6[1] != 0 || perm6[2] != 2 || perm9[0] != 1 || perm9[1] != 0 ||
-          perm9[2] != 2)
-        continue;
-
-      // 1 2 0
-      std::vector<int> perm10 = get_node_attr_ai(*node10, "perm");
-      if (perm10.size() != 3) continue;
-
-      if (perm10[0] != 1 || perm10[1] != 2 || perm10[2] != 0) continue;
-
-      // 1 0 2
-      std::vector<int> perm14 = get_node_attr_ai(*node14, "perm");
-      if (perm14.size() != 3) continue;
-
-      if (perm14[0] != 1 || perm14[1] != 0 || perm14[2] != 2) continue;
-
-      int softmax_axis = get_node_attr_i(*node12, "axis");
-      if (softmax_axis != 2) continue;
-
-      // 1/-1, seqlen * num_heads, embed_dim / num_heads
-      std::vector<int> shape5;
-      std::vector<int> shape7;
-      std::vector<int> shape8;
-      if (node5->input_size() == 1) {
-        shape5 = get_node_attr_ai(*node5, "shape");
-      } else {
-        // skip weight reshape
-        if (weights.find(node5->input(1)) == weights.end()) continue;
-
-        shape5 = get_node_attr_from_input_ai(weights[node5->input(1)]);
-      }
-      if (node7->input_size() == 1) {
-        shape7 = get_node_attr_ai(*node7, "shape");
-      } else {
-        // skip weight reshape
-        if (weights.find(node7->input(1)) == weights.end()) continue;
-
-        shape7 = get_node_attr_from_input_ai(weights[node7->input(1)]);
-      }
-      if (node8->input_size() == 1) {
-        shape8 = get_node_attr_ai(*node8, "shape");
-      } else {
-        // skip weight reshape
-        if (weights.find(node8->input(1)) == weights.end()) continue;
-
-        shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]);
-      }
-
-      if (shape5.size() != 3 || shape7.size() != 3 || shape8.size() != 3) continue;
-
-      if (shape5[1] != shape7[1] || shape5[1] != shape8[1] || shape5[2] != shape7[2] ||
-          shape5[2] != shape8[2])
-        continue;
-
-      int num_heads = embed_dim / shape5[2];
-
-      // 1, seqlen, embed_dim
-      std::vector<int> shape15;
-      if (node15->input_size() == 1) {
-        shape15 = get_node_attr_ai(*node15, "shape");
-      } else {
-        // skip weight reshape
-        if (weights.find(node15->input(1)) == weights.end()) continue;
-
-        shape15 = get_node_attr_from_input_ai(weights[node15->input(1)]);
-      }
-
-      if (shape15.size() != 3) continue;
-
-      if (shape15[2] != embed_dim || shape15[1] * num_heads != shape8[1]) continue;
-
-      // reduce
-      node->set_op_type("noop_reducedncnn");
-      node2->set_op_type("noop_reducedncnn");
-      node3->set_op_type("noop_reducedncnn");
-      node4->set_op_type("noop_reducedncnn");
-      node5->set_op_type("noop_reducedncnn");
-      node6->set_op_type("noop_reducedncnn");
-      node7->set_op_type("noop_reducedncnn");
-      node8->set_op_type("noop_reducedncnn");
-      node9->set_op_type("noop_reducedncnn");
-      node10->set_op_type("noop_reducedncnn");
-      node11->set_op_type("noop_reducedncnn");
-      node12->set_op_type("noop_reducedncnn");
-      node13->set_op_type("noop_reducedncnn");
-      node14->set_op_type("noop_reducedncnn");
-      node15->set_op_type("noop_reducedncnn");
-      node16->set_op_type("noop_reducedncnn");
-
-      node_reference[node2->input(0)] -= 1;
-      node_reference[node3->input(0)] -= 1;
-      node_reference[node4->input(0)] -= 1;
-      node_reference[node4->input(1)] -= 1;
-      node_reference[node5->input(0)] -= 1;
-      if (node5->input_size() 
== 2) { - node_reference[node5->input(1)] -= 1; - } - node_reference[node6->input(0)] -= 1; - node_reference[node7->input(0)] -= 1; - if (node7->input_size() == 2) { - node_reference[node7->input(1)] -= 1; - } - node_reference[node8->input(0)] -= 1; - if (node8->input_size() == 2) { - node_reference[node8->input(1)] -= 1; - } - node_reference[node9->input(0)] -= 1; - node_reference[node10->input(0)] -= 1; - node_reference[node11->input(0)] -= 1; - node_reference[node11->input(1)] -= 1; - node_reference[node12->input(0)] -= 1; - node_reference[node13->input(0)] -= 1; - node_reference[node13->input(1)] -= 1; - node_reference[node14->input(0)] -= 1; - node_reference[node15->input(0)] -= 1; - if (node15->input_size() == 2) { - node_reference[node15->input(1)] -= 1; - } - node_reference[node16->input(0)] -= 1; - node_reference[node17->input(0)] -= 1; - - blob_names.erase(node->output(0)); - blob_names.erase(node2->output(0)); - blob_names.erase(node3->output(0)); - blob_names.erase(node3->output(1)); - blob_names.erase(node3->output(2)); - blob_names.erase(node4->output(0)); - blob_names.erase(node5->output(0)); - blob_names.erase(node6->output(0)); - blob_names.erase(node7->output(0)); - blob_names.erase(node8->output(0)); - blob_names.erase(node9->output(0)); - blob_names.erase(node10->output(0)); - blob_names.erase(node11->output(0)); - blob_names.erase(node12->output(0)); - blob_names.erase(node13->output(0)); - blob_names.erase(node14->output(0)); - blob_names.erase(node15->output(0)); - blob_names.erase(node16->output(0)); - - std::string qkvw = node->input(1); - std::string qkvb = node2->input(1); - std::string ow = node16->input(1); - std::string ob = node17->input(1); - - node17->set_op_type("MultiHeadAttention"); - node17->clear_input(); - node17->add_input(node->input(0)); - // qkv - node17->add_input(qkvw); - node17->add_input(qkvb); - // out linear - node17->add_input(ow); - node17->add_input(ob); - - onnx::AttributeProto* attr_embed_dim = node17->add_attribute(); - attr_embed_dim->set_name("embed_dim"); - attr_embed_dim->set_i(embed_dim); - - onnx::AttributeProto* attr_num_heads = node17->add_attribute(); - attr_num_heads->set_name("num_heads"); - attr_num_heads->set_i(num_heads); - - reduced_node_count += 16; - i += 16; + + for (int i = 0; i < node_count; i++) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + // MultiHeadAttention <= MatMul(qkv) - Add - Split + // - Mul + // - Reshape - Transpose + // - Reshape - Reshape - Transpose - Transpose + // - Gemm - Softmax - Gemm - Transpose - Reshape - + // MatMul - Add + if (node->op_type() == "MatMul") + { + if (i + 16 >= node_count) continue; + + if (node_reference[node->output(0)] != 1) continue; + + onnx::NodeProto* node2 = mutable_graph->mutable_node(i + 1); + onnx::NodeProto* node3 = mutable_graph->mutable_node(i + 2); + onnx::NodeProto* node4 = mutable_graph->mutable_node(i + 3); + onnx::NodeProto* node5 = mutable_graph->mutable_node(i + 4); + onnx::NodeProto* node6 = mutable_graph->mutable_node(i + 5); + onnx::NodeProto* node7 = mutable_graph->mutable_node(i + 6); + onnx::NodeProto* node8 = mutable_graph->mutable_node(i + 7); + onnx::NodeProto* node9 = mutable_graph->mutable_node(i + 8); + onnx::NodeProto* node10 = mutable_graph->mutable_node(i + 9); + onnx::NodeProto* node11 = mutable_graph->mutable_node(i + 10); + onnx::NodeProto* node12 = mutable_graph->mutable_node(i + 11); + onnx::NodeProto* node13 = mutable_graph->mutable_node(i + 12); + onnx::NodeProto* node14 = mutable_graph->mutable_node(i + 13); + 
onnx::NodeProto* node15 = mutable_graph->mutable_node(i + 14);
+            onnx::NodeProto* node16 = mutable_graph->mutable_node(i + 15);
+            onnx::NodeProto* node17 = mutable_graph->mutable_node(i + 16);
+
+            if (node2->op_type() != "Add" || node3->op_type() != "Split" || node4->op_type() != "Mul" ||
+                node5->op_type() != "Reshape" || node6->op_type() != "Transpose" ||
+                node7->op_type() != "Reshape" || node8->op_type() != "Reshape" ||
+                node9->op_type() != "Transpose" || node10->op_type() != "Transpose" ||
+                node11->op_type() != "MatMul" || node12->op_type() != "Softmax" ||
+                node13->op_type() != "MatMul" || node14->op_type() != "Transpose" ||
+                node15->op_type() != "Reshape" || node16->op_type() != "MatMul" ||
+                node17->op_type() != "Add")
+                continue;
+
+            if (node_reference[node2->output(0)] != 1 || node_reference[node3->output(0)] != 1 ||
+                node_reference[node3->output(1)] != 1 || node_reference[node3->output(2)] != 1 ||
+                node_reference[node4->output(0)] != 1 || node_reference[node5->output(0)] != 1 ||
+                node_reference[node6->output(0)] != 1 || node_reference[node7->output(0)] != 1 ||
+                node_reference[node8->output(0)] != 1 || node_reference[node9->output(0)] != 1 ||
+                node_reference[node10->output(0)] != 1 || node_reference[node11->output(0)] != 1 ||
+                node_reference[node12->output(0)] != 1 || node_reference[node13->output(0)] != 1 ||
+                node_reference[node14->output(0)] != 1 || node_reference[node15->output(0)] != 1 ||
+                node_reference[node16->output(0)] != 1)
+                continue;
+
+            if (node2->input(0) != node->output(0) || node3->input(0) != node2->output(0) ||
+                node4->input(0) != node3->output(0) || node5->input(0) != node4->output(0) ||
+                node6->input(0) != node5->output(0) || node7->input(0) != node3->output(1) ||
+                node8->input(0) != node3->output(2) || node9->input(0) != node8->output(0) ||
+                node10->input(0) != node7->output(0) || node11->input(0) != node6->output(0) ||
+                node11->input(1) != node10->output(0) || node12->input(0) != node11->output(0) ||
+                node13->input(0) != node12->output(0) || node13->input(1) != node9->output(0) ||
+                node14->input(0) != node13->output(0) || node15->input(0) != node14->output(0) ||
+                node16->input(0) != node15->output(0) || node17->input(0) != node16->output(0))
+                continue;
+
+            std::vector<float> qkv_B = get_node_attr_from_input_af(weights[node2->input(1)]);
+            std::vector<float> o_B = get_node_attr_from_input_af(weights[node17->input(1)]);
+
+            if (qkv_B.size() != o_B.size() * 3) continue;
+
+            int embed_dim = o_B.size();
+
+            // 1 0 2
+            std::vector<int> perm6 = get_node_attr_ai(*node6, "perm");
+            std::vector<int> perm9 = get_node_attr_ai(*node9, "perm");
+            if (perm6.size() != 3 || perm9.size() != 3) continue;
+
+            if (perm6[0] != 1 || perm6[1] != 0 || perm6[2] != 2 || perm9[0] != 1 || perm9[1] != 0 ||
+                perm9[2] != 2)
+                continue;
+
+            // 1 2 0
+            std::vector<int> perm10 = get_node_attr_ai(*node10, "perm");
+            if (perm10.size() != 3) continue;
+
+            if (perm10[0] != 1 || perm10[1] != 2 || perm10[2] != 0) continue;
+
+            // 1 0 2
+            std::vector<int> perm14 = get_node_attr_ai(*node14, "perm");
+            if (perm14.size() != 3) continue;
+
+            if (perm14[0] != 1 || perm14[1] != 0 || perm14[2] != 2) continue;
+
+            int softmax_axis = get_node_attr_i(*node12, "axis");
+            if (softmax_axis != 2) continue;
+
+            // 1/-1, seqlen * num_heads, embed_dim / num_heads
+            std::vector<int> shape5;
+            std::vector<int> shape7;
+            std::vector<int> shape8;
+            if (node5->input_size() == 1)
+            {
+                shape5 = get_node_attr_ai(*node5, "shape");
+            }
+            else
+            {
+                // skip weight reshape
+                if (weights.find(node5->input(1)) == weights.end()) continue;
+
+                shape5 = get_node_attr_from_input_ai(weights[node5->input(1)]);
+            }
+            if (node7->input_size() == 1)
+            {
+                shape7 = get_node_attr_ai(*node7, "shape");
+            }
+            else
+            {
+                // skip weight reshape
+                if (weights.find(node7->input(1)) == weights.end()) continue;
+
+                shape7 = get_node_attr_from_input_ai(weights[node7->input(1)]);
+            }
+            if (node8->input_size() == 1)
+            {
+                shape8 = get_node_attr_ai(*node8, "shape");
+            }
+            else
+            {
+                // skip weight reshape
+                if (weights.find(node8->input(1)) == weights.end()) continue;
+
+                shape8 = get_node_attr_from_input_ai(weights[node8->input(1)]);
+            }
+
+            if (shape5.size() != 3 || shape7.size() != 3 || shape8.size() != 3) continue;
+
+            if (shape5[1] != shape7[1] || shape5[1] != shape8[1] || shape5[2] != shape7[2] ||
+                shape5[2] != shape8[2])
+                continue;
+
+            int num_heads = embed_dim / shape5[2];
+
+            // 1, seqlen, embed_dim
+            std::vector<int> shape15;
+            if (node15->input_size() == 1)
+            {
+                shape15 = get_node_attr_ai(*node15, "shape");
+            }
+            else
+            {
+                // skip weight reshape
+                if (weights.find(node15->input(1)) == weights.end()) continue;
+
+                shape15 = get_node_attr_from_input_ai(weights[node15->input(1)]);
+            }
+
+            if (shape15.size() != 3) continue;
+
+            if (shape15[2] != embed_dim || shape15[1] * num_heads != shape8[1]) continue;
+
+            // reduce
+            node->set_op_type("noop_reducedncnn");
+            node2->set_op_type("noop_reducedncnn");
+            node3->set_op_type("noop_reducedncnn");
+            node4->set_op_type("noop_reducedncnn");
+            node5->set_op_type("noop_reducedncnn");
+            node6->set_op_type("noop_reducedncnn");
+            node7->set_op_type("noop_reducedncnn");
+            node8->set_op_type("noop_reducedncnn");
+            node9->set_op_type("noop_reducedncnn");
+            node10->set_op_type("noop_reducedncnn");
+            node11->set_op_type("noop_reducedncnn");
+            node12->set_op_type("noop_reducedncnn");
+            node13->set_op_type("noop_reducedncnn");
+            node14->set_op_type("noop_reducedncnn");
+            node15->set_op_type("noop_reducedncnn");
+            node16->set_op_type("noop_reducedncnn");
+
+            node_reference[node2->input(0)] -= 1;
+            node_reference[node3->input(0)] -= 1;
+            node_reference[node4->input(0)] -= 1;
+            node_reference[node4->input(1)] -= 1;
+            node_reference[node5->input(0)] -= 1;
+            if (node5->input_size() == 2)
+            {
+                node_reference[node5->input(1)] -= 1;
+            }
+            node_reference[node6->input(0)] -= 1;
+            node_reference[node7->input(0)] -= 1;
+            if (node7->input_size() == 2)
+            {
+                node_reference[node7->input(1)] -= 1;
+            }
+            node_reference[node8->input(0)] -= 1;
+            if (node8->input_size() == 2)
+            {
+                node_reference[node8->input(1)] -= 1;
+            }
+            node_reference[node9->input(0)] -= 1;
+            node_reference[node10->input(0)] -= 1;
+            node_reference[node11->input(0)] -= 1;
+            node_reference[node11->input(1)] -= 1;
+            node_reference[node12->input(0)] -= 1;
+            node_reference[node13->input(0)] -= 1;
+            node_reference[node13->input(1)] -= 1;
+            node_reference[node14->input(0)] -= 1;
+            node_reference[node15->input(0)] -= 1;
+            if (node15->input_size() == 2)
+            {
+                node_reference[node15->input(1)] -= 1;
+            }
+            node_reference[node16->input(0)] -= 1;
+            node_reference[node17->input(0)] -= 1;
+
+            blob_names.erase(node->output(0));
+            blob_names.erase(node2->output(0));
+            blob_names.erase(node3->output(0));
+            blob_names.erase(node3->output(1));
+            blob_names.erase(node3->output(2));
+            blob_names.erase(node4->output(0));
+            blob_names.erase(node5->output(0));
+            blob_names.erase(node6->output(0));
+            blob_names.erase(node7->output(0));
+            blob_names.erase(node8->output(0));
+            blob_names.erase(node9->output(0));
+            blob_names.erase(node10->output(0));
+            blob_names.erase(node11->output(0));
+            
blob_names.erase(node12->output(0)); + blob_names.erase(node13->output(0)); + blob_names.erase(node14->output(0)); + blob_names.erase(node15->output(0)); + blob_names.erase(node16->output(0)); + + std::string qkvw = node->input(1); + std::string qkvb = node2->input(1); + std::string ow = node16->input(1); + std::string ob = node17->input(1); + + node17->set_op_type("MultiHeadAttention"); + node17->clear_input(); + node17->add_input(node->input(0)); + // qkv + node17->add_input(qkvw); + node17->add_input(qkvb); + // out linear + node17->add_input(ow); + node17->add_input(ob); + + onnx::AttributeProto* attr_embed_dim = node17->add_attribute(); + attr_embed_dim->set_name("embed_dim"); + attr_embed_dim->set_i(embed_dim); + + onnx::AttributeProto* attr_num_heads = node17->add_attribute(); + attr_num_heads->set_name("num_heads"); + attr_num_heads->set_i(num_heads); + + reduced_node_count += 16; + i += 16; + } } - } } diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h index 31dc6f5b93..ec4575b51a 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/fuse_pass.h @@ -4,30 +4,35 @@ #include "shape_inference.h" #include "utils.h" -void fuse_identity(onnx::GraphProto* mutable_graph, +void fuse_identity(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_rewrite_gather(onnx::GraphProto* mutable_graph, +void fuse_rewrite_gather(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_weight_reshape(onnx::GraphProto* mutable_graph, +void fuse_weight_reshape(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_shufflechannel(onnx::GraphProto* mutable_graph, +void fuse_shufflechannel(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph, +void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); /** * @brief fuse subgraph @@ -46,85 +51,104 @@ void fuse_shufflechannel_split(onnx::GraphProto* mutable_graph, * @param blob_names * @param reduced_node_count */ -void fuse_conv_reshape(onnx::GraphProto* mutable_graph, +void fuse_conv_reshape(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, +void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_hardswish(onnx::GraphProto* 
mutable_graph,
+void fuse_hardswish(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_hardsigmoid(onnx::GraphProto* mutable_graph,
+void fuse_hardsigmoid(onnx::GraphProto* mutable_graph,
                       std::map<std::string, onnx::TensorProto>& weights,
-                      std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                      int& reduced_node_count);
+                      std::map<std::string, int>& node_reference,
+                      std::set<std::string>& blob_names,
+                      int& reduced_node_count);

-void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph,
+void fuse_batchnorm1d_squeeze_unsqueeze(onnx::GraphProto* mutable_graph,
                                         std::map<std::string, onnx::TensorProto>& weights,
-                                        std::map<std::string, int>& node_reference,
-                                        std::set<std::string>& blob_names, int& reduced_node_count);
+                                        std::map<std::string, int>& node_reference,
+                                        std::set<std::string>& blob_names,
+                                        int& reduced_node_count);

-void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph,
+void fuse_unsqueeze_prelu(onnx::GraphProto* mutable_graph,
                           std::map<std::string, onnx::TensorProto>& weights,
-                          std::map<std::string, int>& node_reference,
-                          std::set<std::string>& blob_names, int& reduced_node_count);
+                          std::map<std::string, int>& node_reference,
+                          std::set<std::string>& blob_names,
+                          int& reduced_node_count);

-void fuse_normalize(onnx::GraphProto* mutable_graph,
+void fuse_normalize(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_groupnorm(onnx::GraphProto* mutable_graph,
+void fuse_groupnorm(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_layernorm(onnx::GraphProto* mutable_graph,
+void fuse_layernorm(onnx::GraphProto* mutable_graph,
                     std::map<std::string, onnx::TensorProto>& weights,
-                    std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                    int& reduced_node_count);
+                    std::map<std::string, int>& node_reference,
+                    std::set<std::string>& blob_names,
+                    int& reduced_node_count);

-void fuse_flatten(onnx::GraphProto* mutable_graph,
+void fuse_flatten(onnx::GraphProto* mutable_graph,
                   std::map<std::string, onnx::TensorProto>& weights,
-                  std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                  int& reduced_node_count);
+                  std::map<std::string, int>& node_reference,
+                  std::set<std::string>& blob_names,
+                  int& reduced_node_count);

-void fuse_pixelshuffle(onnx::GraphProto* mutable_graph,
+void fuse_pixelshuffle(onnx::GraphProto* mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count);
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count);

-void fuse_reorg(onnx::GraphProto* mutable_graph, std::map<std::string, onnx::TensorProto>& weights,
-                std::map<std::string, int>& node_reference, std::set<std::string>& blob_names,
-                int& reduced_node_count);
+void fuse_reorg(onnx::GraphProto* mutable_graph,
+                std::map<std::string, onnx::TensorProto>& weights,
+                std::map<std::string, int>& node_reference,
+                std::set<std::string>& blob_names,
+                int& reduced_node_count);

-void fuse_expand_broadcast(onnx::GraphProto* mutable_graph,
+void fuse_expand_broadcast(onnx::GraphProto* mutable_graph,
                            std::map<std::string, onnx::TensorProto>& weights,
-                           std::map<std::string, int>& node_reference,
-                           std::set<std::string>& blob_names, int& reduced_node_count);
+                           std::map<std::string, int>& node_reference,
+                           std::set<std::string>& blob_names,
+                           int& reduced_node_count);

-void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph,
+void fuse_lstm_gru_rnn(onnx::GraphProto* mutable_graph,
                        std::map<std::string, onnx::TensorProto>& weights,
-                       std::map<std::string, int>& node_reference,
-                       std::set<std::string>& blob_names, int& reduced_node_count);
+                       std::map<std::string, int>& node_reference,
+                       std::set<std::string>& blob_names,
+                       int& reduced_node_count);

-void fuse_multiheadattention(onnx::GraphProto* mutable_graph,
+void 
fuse_multiheadattention(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); -void fuse_weight_transpose(onnx::GraphProto* mutable_graph, +void fuse_weight_transpose(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, - std::set& blob_names, int& reduced_node_count); - -void fuse_swish(onnx::GraphProto* mutable_graph, std::map& weights, - std::map& node_reference, std::set& blob_names, - int& reduced_node_count); + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); + +void fuse_swish(onnx::GraphProto* mutable_graph, + std::map& weights, + std::map& node_reference, + std::set& blob_names, + int& reduced_node_count); diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp index ca8cd628ad..bc38599b63 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/onnx2ncnn.cpp @@ -26,2719 +26,3551 @@ #include "shape_inference.h" #include "utils.h" -int main(int argc, char** argv) { - if (!(argc == 2 || argc == 4)) { - fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); - return -1; - } - - const char* onnxpb = argv[1]; - const char* ncnn_prototxt = argc == 4 ? argv[2] : "ncnn.param"; - const char* ncnn_modelbin = argc == 4 ? argv[3] : "ncnn.bin"; - - onnx::ModelProto model; - - // load - bool s1 = read_proto_from_binary(onnxpb, &model); - if (!s1) { - fprintf(stderr, "read_proto_from_binary failed\n"); - return -1; - } - FILE* pp = fopen(ncnn_prototxt, "wb"); - FILE* bp = fopen(ncnn_modelbin, "wb"); - // magic - fprintf(pp, "7767517\n"); - onnx::GraphProto* mutable_graph = model.mutable_graph(); - int node_count = mutable_graph->node_size(); - - // node reference - std::map node_reference; - - // weight node and weight reshape node - std::map weights; - for (int j = 0; j < mutable_graph->initializer_size(); j++) { - const onnx::TensorProto& initializer = mutable_graph->initializer(j); - - // fprintf(stderr, "weight = %s %d\n", initializer.name().c_str(), - // initializer.data_type()); - - weights[initializer.name()] = initializer; - } - // topological sort - { - // name -> producer node index - std::set producers; - for (int j = 0; j < mutable_graph->input_size(); j++) { - const std::string& input_name = mutable_graph->input(j).name(); - producers.insert(input_name); +int main(int argc, char** argv) +{ + if (!(argc == 2 || argc == 4)) + { + fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); + return -1; } - for (int i = 0; i < node_count;) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); + const char* onnxpb = argv[1]; + const char* ncnn_prototxt = argc == 4 ? argv[2] : "ncnn.param"; + const char* ncnn_modelbin = argc == 4 ? 
argv[3] : "ncnn.bin"; - bool swapnode = false; - std::string missing_input_name; - for (int j = 0; j < (int)node->input_size(); j++) { - const std::string& input_name = node->input(j); - if (input_name.empty()) continue; + onnx::ModelProto model; - if (producers.find(input_name) == producers.end() && - weights.find(input_name) == weights.end()) { - swapnode = true; - missing_input_name = input_name; - break; - } - } + // load + bool s1 = read_proto_from_binary(onnxpb, &model); + if (!s1) + { + fprintf(stderr, "read_proto_from_binary failed\n"); + return -1; + } + FILE* pp = fopen(ncnn_prototxt, "wb"); + FILE* bp = fopen(ncnn_modelbin, "wb"); + // magic + fprintf(pp, "7767517\n"); + onnx::GraphProto* mutable_graph = model.mutable_graph(); + int node_count = mutable_graph->node_size(); + + // node reference + std::map node_reference; + + // weight node and weight reshape node + std::map weights; + for (int j = 0; j < mutable_graph->initializer_size(); j++) + { + const onnx::TensorProto& initializer = mutable_graph->initializer(j); - if (!swapnode) { - for (int j = 0; j < (int)node->output_size(); j++) { - const std::string& output_name = node->output(j); - if (output_name.empty()) continue; + // fprintf(stderr, "weight = %s %d\n", initializer.name().c_str(), + // initializer.data_type()); - producers.insert(output_name); + weights[initializer.name()] = initializer; + } + // topological sort + { + // name -> producer node index + std::set producers; + for (int j = 0; j < mutable_graph->input_size(); j++) + { + const std::string& input_name = mutable_graph->input(j).name(); + producers.insert(input_name); } - i++; - continue; - } - - // find node that produce missing_input_name - int q = i + 1; - for (; q < node_count; q++) { - onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); - bool found = false; - for (int j = 0; j < (int)nodeq->output_size(); j++) { - const std::string& output_name = nodeq->output(j); - if (output_name == missing_input_name) { - found = true; - break; - } - } + for (int i = 0; i < node_count;) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + bool swapnode = false; + std::string missing_input_name; + for (int j = 0; j < (int)node->input_size(); j++) + { + const std::string& input_name = node->input(j); + if (input_name.empty()) continue; + + if (producers.find(input_name) == producers.end() && + weights.find(input_name) == weights.end()) + { + swapnode = true; + missing_input_name = input_name; + break; + } + } - if (found) break; - } + if (!swapnode) + { + for (int j = 0; j < (int)node->output_size(); j++) + { + const std::string& output_name = node->output(j); + if (output_name.empty()) continue; - if (q == node_count) { - fprintf(stderr, "cannot find node produces %s but node %d requires it\n", - missing_input_name.c_str(), i); - return -1; - } - - // fprintf(stderr, "swap %d %d\n", i, q); - // swap this node with q - onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); - onnx::NodeProto tmp = *node; - *node = *nodeq; - *nodeq = tmp; - } - } - // global definition line - // [layer count] [blob count] - std::set blob_names; - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); - - const std::string& op = node.op_type(); - - std::string name = node.name(); - if (name.empty()) { - name = node.output(0); - } + producers.insert(output_name); + } - if (op == "Constant") { - onnx::TensorProto tensor = get_node_attr_tensor(node, "value"); - weights[node.output(0)] = tensor; - } + i++; + continue; + } - for 
(int j = 0; j < (int)node.input_size(); j++) { - const std::string& input_name = node.input(j); + // find node that produce missing_input_name + int q = i + 1; + for (; q < node_count; q++) + { + onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); + bool found = false; + for (int j = 0; j < (int)nodeq->output_size(); j++) + { + const std::string& output_name = nodeq->output(j); + if (output_name == missing_input_name) + { + found = true; + break; + } + } + + if (found) break; + } - blob_names.insert(input_name); + if (q == node_count) + { + fprintf(stderr, "cannot find node produces %s but node %d requires it\n", missing_input_name.c_str(), i); + return -1; + } - if (node_reference.find(input_name) == node_reference.end()) { - node_reference[input_name] = 1; - } else { - node_reference[input_name] = node_reference[input_name] + 1; - } + // fprintf(stderr, "swap %d %d\n", i, q); + // swap this node with q + onnx::NodeProto* nodeq = mutable_graph->mutable_node(q); + onnx::NodeProto tmp = *node; + *node = *nodeq; + *nodeq = tmp; + } } + // global definition line + // [layer count] [blob count] + std::set blob_names; + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); - if (op == "Dropout") { - const std::string& output_name = node.output(0); - blob_names.insert(output_name); - node_reference[output_name] = 0; - continue; - } + const std::string& op = node.op_type(); - for (int j = 0; j < (int)node.output_size(); j++) { - const std::string& output_name = node.output(j); + std::string name = node.name(); + if (name.empty()) + { + name = node.output(0); + } - blob_names.insert(output_name); + if (op == "Constant") + { + onnx::TensorProto tensor = get_node_attr_tensor(node, "value"); + weights[node.output(0)] = tensor; + } - node_reference[output_name] = 0; - } - } - // include Input node - int input_node_count = 0; - for (int j = 0; j < mutable_graph->input_size(); j++) { - const std::string& input_name = mutable_graph->input(j).name(); - - // check weight - if (weights.find(input_name) != weights.end()) continue; - - blob_names.insert(input_name); - - input_node_count++; - } - - // for (auto a: node_reference) - // { - // fprintf(stderr, "a = %s %d\n", a.first.c_str(), a.second); - // } - - // op chain fusion - int reduced_node_count = 0; - { - fuse_identity(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_conv_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_weight_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_weight_transpose(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_shufflechannel(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_shufflechannel_split(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_hardsigmoid(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_hardswish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_swish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_batchnorm1d_squeeze_unsqueeze(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_unsqueeze_prelu(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_normalize(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_groupnorm(mutable_graph, weights, node_reference, blob_names, 
reduced_node_count); - fuse_layernorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_flatten(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_pixelshuffle(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_reorg(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_expand_broadcast(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_lstm_gru_rnn(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_multiheadattention(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - fuse_binaryop_with_scalar(mutable_graph, weights, node_reference, blob_names, - reduced_node_count); - fuse_rewrite_gather(mutable_graph, weights, node_reference, blob_names, reduced_node_count); - } - // reduce common const weight node_reference - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); - - const std::string& op = node.op_type(); - - if (op == "BatchNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else if (op == "BiasGelu") { - node_reference[node.input(1)] -= 1; - } else if (op == "Clip") { - if (node.input_size() == 3) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "Conv") { - node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } else if (op == "ConvTranspose") { - node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } else if (op == "EmbedLayerNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - node_reference[node.input(5)] -= 1; - node_reference[node.input(6)] -= 1; - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB = get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C, C is optional. 
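(Editor's note: the Gemm handling here folds the B and C inputs into the layer only in the InnerProduct-shaped case, alpha == 1, beta == 1, transA == 0, transB == 1, i.e. Y = X * W^T + C. A minimal self-contained sketch of that computation follows; the helper name and the row-major N x K weight layout are illustrative assumptions, not part of the patch:

    #include <vector>

    // InnerProduct-style Gemm: Y = X * W^T + C, the only configuration the
    // converter maps to ncnn InnerProduct (see the alpha/beta/trans checks).
    static std::vector<float> innerproduct_gemm(const std::vector<float>& x,  // 1 x K input row
                                                const std::vector<float>& W,  // N x K weights, row-major
                                                const std::vector<float>& C,  // N bias values
                                                int K, int N)
    {
        std::vector<float> y(N, 0.f);
        for (int n = 0; n < N; n++)
        {
            float sum = C[n];  // beta == 1: bias added unscaled
            for (int k = 0; k < K; k++)
            {
                sum += x[k] * W[n * K + k];  // transB == 1: multiply by W^T
            }
            y[n] = sum;
        }
        return y;
    }

Any other alpha/beta/transpose combination falls through to the generic Gemm layer instead, so its weights stay ordinary blobs and are not decremented here.)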
- node_reference[node.input(1)] -= 1; - if (node.input_size() == 3) { - node_reference[node.input(2)] -= 1; - } - } - } else if (op == "GroupNorm") { - int affine = get_node_attr_i(node, "affine", 1); - if (affine) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "GRU") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "InstanceNormalization") { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } else if (op == "LayerNorm") { - int affine = get_node_attr_i(node, "affine", 1); - if (affine) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - } - } else if (op == "LSTM") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - // InnerProduct - node_reference[node.input(1)] -= 1; - } - } else if (op == "MultiHeadAttention") { - if (node.input_size() == 5) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else { - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - node_reference[node.input(5)] -= 1; - node_reference[node.input(6)] -= 1; - node_reference[node.input(7)] -= 1; - node_reference[node.input(8)] -= 1; - node_reference[node.input(9)] -= 1; - node_reference[node.input(10)] -= 1; - } - } else if (op == "NonMaxSuppression") { - if (node.input_size() >= 3) { - node_reference[node.input(2)] -= 1; - } - if (node.input_size() >= 4) { - node_reference[node.input(3)] -= 1; - } - if (node.input_size() >= 5) { - node_reference[node.input(4)] -= 1; - } - } else if (op == "Pad") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } else if (op == "PRelu") { - node_reference[node.input(1)] -= 1; - } else if (op == "Reshape") { - if (node.input_size() == 2) { - if (weights[node.input(1)].data_type() != 0) { - node_reference[node.input(1)] -= 1; - } - } - } else if (op == "Resize") { - if (node.input_size() == 2) { - // opset 10 - node_reference[node.input(1)] -= 1; - } else { - // opset 11+ - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - if (node.input_size() >= 4) { - node_reference[node.input(3)] -= 1; - } - } - } else if (op == "RNN") { - for (int j = 1; j < node.input_size(); j++) { - node_reference[node.input(j)] -= 1; - } - } else if (op == "SkipLayerNormalization") { - node_reference[node.input(2)] -= 1; - node_reference[node.input(3)] -= 1; - node_reference[node.input(4)] -= 1; - } else if (op == "Slice") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - node_reference[node.input(2)] -= 1; - if (node.input_size() >= 4) node_reference[node.input(3)] -= 1; - if (node.input_size() >= 5) node_reference[node.input(4)] -= 1; - } - } else if (op == "Upsample") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - if (node.input_size() >= 2) { - node_reference[node.input(1)] -= 1; - } - } - } + for (int j = 0; j < (int)node.input_size(); j++) + { + const std::string& input_name = node.input(j); - // for (auto a: node_reference) - // { - // fprintf(stderr, "b = %s %d\n", a.first.c_str(), a.second); - // } + 
blob_names.insert(input_name); - // count all weight node with zero reference - int zero_reference_weight_node_count = 0; - for (std::map::iterator it = weights.begin(); it != weights.end(); - it++) { - const std::string& input_name = it->first; + if (node_reference.find(input_name) == node_reference.end()) + { + node_reference[input_name] = 1; + } + else + { + node_reference[input_name] = node_reference[input_name] + 1; + } + } - int refcount = node_reference[input_name]; - if (refcount == 0) zero_reference_weight_node_count++; - } + if (op == "Dropout") + { + const std::string& output_name = node.output(0); + blob_names.insert(output_name); + node_reference[output_name] = 0; + continue; + } - // we always treat constant node as weight or binaryop_weights - // do not count it twice for layer_count - int constant_node_count_moved_to_weight = 0; - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); + for (int j = 0; j < (int)node.output_size(); j++) + { + const std::string& output_name = node.output(j); - const std::string& op = node.op_type(); + blob_names.insert(output_name); - if (op == "Constant") { - constant_node_count_moved_to_weight++; - } - } - - // some op may have anonymous input - // LSTM sequence_lens - blob_names.erase(""); - node_reference.erase(""); - - // remove node_reference entry with reference equals to one - int split_layer_count = 0; - int splitncnn_blob_count = 0; - // split node reference - std::map split_node_reference; - for (std::map::iterator it = node_reference.begin(); it != node_reference.end(); - it++) { - if (it->second > 1) { - split_layer_count++; - splitncnn_blob_count += it->second; - - split_node_reference[it->first] = it->second; + node_reference[output_name] = 0; + } } - } - - fprintf(pp, "%zu %zu\n", - node_count - constant_node_count_moved_to_weight + weights.size() - - zero_reference_weight_node_count - reduced_node_count + input_node_count + - split_layer_count, - blob_names.size() - zero_reference_weight_node_count + splitncnn_blob_count); - - int internal_split = 0; - - // place Input at the beginning - for (int j = 0; j < mutable_graph->input_size(); j++) { - const std::string& input_name = mutable_graph->input(j).name(); + // include Input node + int input_node_count = 0; + for (int j = 0; j < mutable_graph->input_size(); j++) + { + const std::string& input_name = mutable_graph->input(j).name(); - // check weight - if (weights.find(input_name) != weights.end()) continue; + // check weight + if (weights.find(input_name) != weights.end()) continue; - fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str()); + blob_names.insert(input_name); - int refcount = node_reference[input_name]; - if (refcount <= 1) { - continue; + input_node_count++; } - char splitname[256]; - sprintf(splitname, "splitncnn_input%d", j); - fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + // for (auto a: node_reference) + // { + // fprintf(stderr, "a = %s %d\n", a.first.c_str(), a.second); + // } - for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + // op chain fusion + int reduced_node_count = 0; + { + fuse_identity(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_conv_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_weight_reshape(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + 
fuse_weight_transpose(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_shufflechannel(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_shufflechannel_split(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_hardsigmoid(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_hardswish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_swish(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_batchnorm1d_squeeze_unsqueeze(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_unsqueeze_prelu(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_normalize(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_groupnorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_layernorm(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_flatten(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_pixelshuffle(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_reorg(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_expand_broadcast(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_lstm_gru_rnn(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_multiheadattention(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_binaryop_with_scalar(mutable_graph, weights, node_reference, blob_names, reduced_node_count); + fuse_rewrite_gather(mutable_graph, weights, node_reference, blob_names, reduced_node_count); } - fprintf(pp, "\n"); - } + // reduce common const weight node_reference + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); - // place MemoryData next - for (std::map::iterator weight_it = weights.begin(); - weight_it != weights.end(); weight_it++) { - const std::string& input_name = weight_it->first; + const std::string& op = node.op_type(); - int refcount = node_reference[input_name]; - if (refcount == 0) { - continue; + if (op == "BatchNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else if (op == "BiasGelu") + { + node_reference[node.input(1)] -= 1; + } + else if (op == "Clip") + { + if (node.input_size() == 3) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "Conv") + { + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + else if (op == "ConvTranspose") + { + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + else if (op == "EmbedLayerNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + node_reference[node.input(5)] -= 1; + node_reference[node.input(6)] -= 1; + } + else if (op == "Gemm") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && 
transB == 1) + { + // InnerProduct-like A * B + C, C is optional. + node_reference[node.input(1)] -= 1; + if (node.input_size() == 3) + { + node_reference[node.input(2)] -= 1; + } + } + } + else if (op == "GroupNorm") + { + int affine = get_node_attr_i(node, "affine", 1); + if (affine) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "GRU") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "InstanceNormalization") + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + else if (op == "LayerNorm") + { + int affine = get_node_attr_i(node, "affine", 1); + if (affine) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + } + } + else if (op == "LSTM") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + // InnerProduct + node_reference[node.input(1)] -= 1; + } + } + else if (op == "MultiHeadAttention") + { + if (node.input_size() == 5) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else + { + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + node_reference[node.input(5)] -= 1; + node_reference[node.input(6)] -= 1; + node_reference[node.input(7)] -= 1; + node_reference[node.input(8)] -= 1; + node_reference[node.input(9)] -= 1; + node_reference[node.input(10)] -= 1; + } + } + else if (op == "NonMaxSuppression") + { + if (node.input_size() >= 3) + { + node_reference[node.input(2)] -= 1; + } + if (node.input_size() >= 4) + { + node_reference[node.input(3)] -= 1; + } + if (node.input_size() >= 5) + { + node_reference[node.input(4)] -= 1; + } + } + else if (op == "Pad") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } + else if (op == "PRelu") + { + node_reference[node.input(1)] -= 1; + } + else if (op == "Reshape") + { + if (node.input_size() == 2) + { + if (weights[node.input(1)].data_type() != 0) + { + node_reference[node.input(1)] -= 1; + } + } + } + else if (op == "Resize") + { + if (node.input_size() == 2) + { + // opset 10 + node_reference[node.input(1)] -= 1; + } + else + { + // opset 11+ + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + if (node.input_size() >= 4) + { + node_reference[node.input(3)] -= 1; + } + } + } + else if (op == "RNN") + { + for (int j = 1; j < node.input_size(); j++) + { + node_reference[node.input(j)] -= 1; + } + } + else if (op == "SkipLayerNormalization") + { + node_reference[node.input(2)] -= 1; + node_reference[node.input(3)] -= 1; + node_reference[node.input(4)] -= 1; + } + else if (op == "Slice") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + node_reference[node.input(2)] -= 1; + if (node.input_size() >= 4) node_reference[node.input(3)] -= 1; + if (node.input_size() >= 5) node_reference[node.input(4)] -= 1; + } + } + else if (op == "Upsample") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + if (node.input_size() >= 2) + { + node_reference[node.input(1)] -= 1; + } + } } - fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", 
input_name.c_str(), input_name.c_str()); - - const onnx::TensorProto& M = weights[input_name]; - - if (M.dims_size() == 0) { - fprintf(pp, " 0=%d", get_tensor_proto_data_size(M)); - } else if (M.dims_size() == 1) { - fprintf(pp, " 0=%d", (int)M.dims(0)); - } else if (M.dims_size() == 2) { - fprintf(pp, " 0=%d", (int)M.dims(1)); - if (M.dims(0) != 1) { - fprintf(pp, " 1=%d", (int)M.dims(0)); - } - } else if (M.dims_size() == 3) { - fprintf(pp, " 0=%d", (int)M.dims(2)); - fprintf(pp, " 1=%d", (int)M.dims(1)); - if (M.dims(0) != 1) { - fprintf(pp, " 2=%d", (int)M.dims(0)); - } - } else if (M.dims_size() == 4) { - fprintf(pp, " 0=%d", (int)M.dims(3)); - fprintf(pp, " 1=%d", (int)M.dims(2)); - fprintf(pp, " 2=%d", (int)M.dims(1)); - } + // for (auto a: node_reference) + // { + // fprintf(stderr, "b = %s %d\n", a.first.c_str(), a.second); + // } - fprintf(pp, "\n"); - if (M.data_type() == 1) { - fwrite_tensor_proto_data(M, bp); - } else if (M.data_type() == 7 || M.data_type() == 6 || M.data_type() == 9 || - M.data_type() == 11) { - fwrite_tensor_proto_data_to_float(M, bp); - } else { - fwrite_tensor_proto_data(M, bp); - } + // count all weight node with zero reference + int zero_reference_weight_node_count = 0; + for (std::map::iterator it = weights.begin(); it != weights.end(); + it++) + { + const std::string& input_name = it->first; - if (refcount <= 1) { - continue; + int refcount = node_reference[input_name]; + if (refcount == 0) zero_reference_weight_node_count++; } - char splitname[256]; - sprintf(splitname, "splitncnn_%d", internal_split); - fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); + // we always treat constant node as weight or binaryop_weights + // do not count it twice for layer_count + int constant_node_count_moved_to_weight = 0; + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); - fprintf(pp, " %s", input_name.c_str()); + const std::string& op = node.op_type(); - for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + if (op == "Constant") + { + constant_node_count_moved_to_weight++; + } } - fprintf(pp, "\n"); - internal_split++; - } + // some op may have anonymous input + // LSTM sequence_lens + blob_names.erase(""); + node_reference.erase(""); + + // remove node_reference entry with reference equals to one + int split_layer_count = 0; + int splitncnn_blob_count = 0; + // split node reference + std::map split_node_reference; + for (std::map::iterator it = node_reference.begin(); it != node_reference.end(); + it++) + { + if (it->second > 1) + { + split_layer_count++; + splitncnn_blob_count += it->second; - for (int i = 0; i < node_count; i++) { - const onnx::NodeProto& node = mutable_graph->node(i); - const std::string& op = node.op_type(); + split_node_reference[it->first] = it->second; + } + } - // fprintf(stderr, "op = %s\n", op.c_str()); + fprintf(pp, "%zu %zu\n", node_count - constant_node_count_moved_to_weight + weights.size() - zero_reference_weight_node_count - reduced_node_count + input_node_count + split_layer_count, blob_names.size() - zero_reference_weight_node_count + splitncnn_blob_count); - if (op == "noop_reducedncnn") { - continue; - } + int internal_split = 0; - std::string name = node.name(); - if (name.empty()) { - name = node.output(0); - } + // place Input at the beginning + for (int j = 0; j < mutable_graph->input_size(); j++) + { + const std::string& input_name = mutable_graph->input(j).name(); - int input_size = node.input_size(); - int 
output_size = node.output_size(); + // check weight + if (weights.find(input_name) != weights.end()) continue; - for (int j = 0; j < (int)node.input_size(); j++) { - const std::string& input_name = node.input(j); + fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str()); - // check weight - if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) { - input_size--; - } + int refcount = node_reference[input_name]; + if (refcount <= 1) + { + continue; + } - if (input_name.empty()) { - input_size--; - } + char splitname[256]; + sprintf(splitname, "splitncnn_input%d", j); + fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); + fprintf(pp, " %s", input_name.c_str()); - // fprintf(stderr, " input = %s\n", input_name.c_str()); + for (int k = 0; k < refcount; k++) + { + fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + } + fprintf(pp, "\n"); } - /* - for (int j=0; j<(int)node.output_size(); j++) + + // place MemoryData next + for (std::map::iterator weight_it = weights.begin(); + weight_it != weights.end(); + weight_it++) { - const std::string& output_name = node.output(j); - fprintf(stderr, " output = %s\n", output_name.c_str()); - } - */ - - if (op == "Abs") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Acos") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Add") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "ArgMax") { - fprintf(pp, "%-16s", "TopK"); - } else if (op == "Asin") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Atan") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "AveragePool" || op == "MaxPool") { - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - if (kernel_shape.size() == 1) { - fprintf(pp, "%-16s", "Pooling1D"); - } else { - fprintf(pp, "%-16s", "Pooling"); - } - } else if (op == "BatchNormalization") { - fprintf(pp, "%-16s", "BatchNorm"); - } else if (op == "BiasGelu") { - fprintf(pp, "%-16s", "BiasGelu"); - } else if (op == "Cast") { - fprintf(pp, "%-16s", "Noop"); - } else if (op == "Ceil") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Clip") { - fprintf(pp, "%-16s", "Clip"); - } else if (op == "Concat") { - fprintf(pp, "%-16s", "Concat"); - } else if (op == "Constant") { - continue; - } else if (op == "ConstantOfShape") { - fprintf(pp, "%-16s", "ConstantOfShape"); - } else if (op == "Conv") { - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - if (kernel_shape.size() == 1) { - fprintf(pp, "%-16s", "Convolution1D"); - } else { - int group = get_node_attr_i(node, "group", 1); - if (group > 1) { - fprintf(pp, "%-16s", "ConvolutionDepthWise"); - } else { - fprintf(pp, "%-16s", "Convolution"); - } - } - } else if (op == "ConvTranspose") { - int group = get_node_attr_i(node, "group", 1); - if (group > 1) { - fprintf(pp, "%-16s", "DeconvolutionDepthWise"); - } else { - fprintf(pp, "%-16s", "Deconvolution"); - } - } else if (op == "Cos") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Crop") { - fprintf(pp, "%-16s", "Crop"); - } else if (op == "DepthToSpace") { - fprintf(pp, "%-16s", "PixelShuffle"); - } else if (op == "DetectionOutput") { - fprintf(pp, "%-16s", "DetectionOutput"); - } else if (op == "Div") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Dropout") { - fprintf(pp, "%-16s", "Dropout"); - output_size = 1; - } else if (op == "Elu") { - fprintf(pp, "%-16s", "ELU"); - } else if (op == "EmbedLayerNormalization") { - fprintf(pp, "%-16s", "EmbedLayerNormalization"); - } else if (op 
== "Equal") { - fprintf(pp, "%-16s", "Compare"); - } else if (op == "Exp") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Expand") { - fprintf(pp, "%-16s", "Expand"); - } else if (op == "Flatten") { - fprintf(pp, "%-16s", "Flatten"); - } else if (op == "Floor") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Gather") { - fprintf(pp, "%-16s", "Gather"); - } else if (op == "Gelu") { - fprintf(pp, "%-16s", "GELU"); - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB = get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C - fprintf(pp, "%-16s", "InnerProduct"); - } else { - fprintf(pp, "%-16s", "Gemm"); - } - } else if (op == "GlobalAveragePool") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "GlobalMaxPool") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - fprintf(pp, "%-16s", "Pooling"); - } else if (op == "GroupNorm") { - fprintf(pp, "%-16s", "GroupNorm"); - } else if (op == "GRU") { - fprintf(pp, "%-16s", "GRU"); - } else if (op == "HardSigmoid") { - fprintf(pp, "%-16s", "HardSigmoid"); - } else if (op == "HardSwish") { - fprintf(pp, "%-16s", "HardSwish"); - } else if (op == "ImageScaler") { - fprintf(pp, "%-16s", "Scale"); - } else if (op == "InstanceNormalization") { - fprintf(pp, "%-16s", "InstanceNorm"); - } else if (op == "LayerNorm") { - fprintf(pp, "%-16s", "LayerNorm"); - } else if (op == "LeakyRelu") { - fprintf(pp, "%-16s", "ReLU"); - } else if (op == "Threshold") { - fprintf(pp, "%-16s", "Threshold"); - } else if (op == "Log") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "LRN") { - fprintf(pp, "%-16s", "LRN"); - } else if (op == "LSTM") { - fprintf(pp, "%-16s", "LSTM"); - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - fprintf(pp, "%-16s", "InnerProduct"); - } else { - fprintf(pp, "%-16s", "Gemm"); - } - } else if (op == "Max") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Min") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Mul") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "MultiHeadAttention") { - fprintf(pp, "%-16s", "MultiHeadAttention"); - } else if (op == "Neg") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "NonMaxSuppression") { - fprintf(pp, "%-16s", "NonMaxSuppression"); - } else if (op == "Normalize") { - fprintf(pp, "%-16s", "Normalize"); - } else if (op == "Pad") { - fprintf(pp, "%-16s", "Padding"); - } else if (op == "PixelShuffle") { - fprintf(pp, "%-16s", "PixelShuffle"); - } else if (op == "Pow") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "PriorBox") { - fprintf(pp, "%-16s", "PriorBox"); - } else if (op == "PRelu") { - fprintf(pp, "%-16s", "PReLU"); - } else if (op == "Range") { - fprintf(pp, "%-16s", "Range"); - } else if (op == "Reciprocal") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || - op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || - op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") { - fprintf(pp, "%-16s", "Reduction"); - } else if (op == "Relu") { - fprintf(pp, "%-16s", "ReLU"); - } else if (op == "Reorg") { - fprintf(pp, "%-16s", 
"Reorg"); - } else if (op == "Reshape") { - fprintf(pp, "%-16s", "Reshape"); - } else if (op == "RNN") { - fprintf(pp, "%-16s", "RNN"); - } else if (op == "RDiv") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "RSub") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "RoiAlign") { - fprintf(pp, "%-16s", "ROIAlign"); - } else if (op == "ScatterND") { - fprintf(pp, "%-16s", "ScatterND"); - } else if (op == "Shape") { - fprintf(pp, "%-16s", "Shape"); - } else if (op == "ShuffleChannel") { - fprintf(pp, "%-16s", "ShuffleChannel"); - } else if (op == "Sigmoid") { - fprintf(pp, "%-16s", "Sigmoid"); - } else if (op == "Sin") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "SkipLayerNormalization") { - fprintf(pp, "%-16s", "SkipLayerNormalization"); - } else if (op == "Slice") { - std::vector ends; - std::vector steps; - bool use_crop = true; - - if (node.input_size() == 1) { - ends = get_node_attr_ai(node, "ends"); - steps = get_node_attr_ai(node, "steps"); // TODO - } else { - ends = get_node_attr_from_input_ai(weights[node.input(2)]); - if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); - } - - // assert step == 1 - for (int i = 0; i < (int)steps.size(); i++) { - if (steps[i] != 1 && steps[i] < ends[i]) { - use_crop = false; - break; - } - } - - if (use_crop) { - fprintf(pp, "%-16s", "Crop"); - } else { - fprintf(pp, "%-16s", "TensorSlice"); - } - } else if (op == "Softmax") { - fprintf(pp, "%-16s", "Softmax"); - } else if (op == "Softplus") { - fprintf(pp, "%-16s", "Softplus"); - } else if (op == "Split") { - fprintf(pp, "%-16s", "Slice"); - } else if (op == "Sqrt") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Squeeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - // fprintf(stderr, "axes[0]: %d\n",axes[0]); - if (axes[0] == 0) { - fprintf(pp, "%-16s", "Noop"); - } else { - fprintf(pp, "%-16s", "Squeeze"); - } - } else if (op == "Sub") { - fprintf(pp, "%-16s", "BinaryOp"); - } else if (op == "Sum") { - fprintf(pp, "%-16s", "Eltwise"); - } else if (op == "Swish") { - fprintf(pp, "%-16s", "Swish"); - } else if (op == "Tan") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Tanh") { - fprintf(pp, "%-16s", "UnaryOp"); - } else if (op == "Tile") { - fprintf(pp, "%-16s", "TileOnnx"); - } else if (op == "TopK") { - fprintf(pp, "%-16s", "TopK"); - } else if (op == "Transpose") { - fprintf(pp, "%-16s", "Permute"); - } else if (op == "Upsample" || op == "Resize") { - fprintf(pp, "%-16s", "Interp"); - } else if (op == "Unsqueeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - // fprintf(stderr, "axes[0]: %d\n",axes[0]); - if (axes[0] == 0) { - fprintf(pp, "%-16s", "Noop"); - } else { - fprintf(pp, "%-16s", "ExpandDims"); - } - } else if (op == "Where") { - fprintf(pp, "%-16s", "Where"); - } else if (op == "Yolov3DetectionOutput") { - fprintf(pp, "%-16s", "Yolov3DetectionOutput"); - } else { - // TODO - fprintf(stderr, "%s not supported yet!\n", op.c_str()); - fprintf(pp, "%-16s", op.c_str()); - } + const std::string& input_name = weight_it->first; - fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + int refcount = node_reference[input_name]; + if (refcount == 0) + { + continue; + } - for (int j = 0; j < (int)node.input_size(); j++) { - std::string input_name = node.input(j); + fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", input_name.c_str(), input_name.c_str()); - // check weight - if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) { - continue; - } 
+ const onnx::TensorProto& M = weights[input_name]; - if (input_name.empty()) { - continue; - } + if (M.dims_size() == 0) + { + fprintf(pp, " 0=%d", get_tensor_proto_data_size(M)); + } + else if (M.dims_size() == 1) + { + fprintf(pp, " 0=%d", (int)M.dims(0)); + } + else if (M.dims_size() == 2) + { + fprintf(pp, " 0=%d", (int)M.dims(1)); + if (M.dims(0) != 1) + { + fprintf(pp, " 1=%d", (int)M.dims(0)); + } + } + else if (M.dims_size() == 3) + { + fprintf(pp, " 0=%d", (int)M.dims(2)); + fprintf(pp, " 1=%d", (int)M.dims(1)); + if (M.dims(0) != 1) + { + fprintf(pp, " 2=%d", (int)M.dims(0)); + } + } + else if (M.dims_size() == 4) + { + fprintf(pp, " 0=%d", (int)M.dims(3)); + fprintf(pp, " 1=%d", (int)M.dims(2)); + fprintf(pp, " 2=%d", (int)M.dims(1)); + } - if (split_node_reference.find(input_name) != split_node_reference.end()) { - int refidx = split_node_reference[input_name] - 1; - split_node_reference[input_name] = refidx; + fprintf(pp, "\n"); + if (M.data_type() == 1) + { + fwrite_tensor_proto_data(M, bp); + } + else if (M.data_type() == 7 || M.data_type() == 6 || M.data_type() == 9 || + M.data_type() == 11) + { + fwrite_tensor_proto_data_to_float(M, bp); + } + else + { + fwrite_tensor_proto_data(M, bp); + } - char splitsuffix[256]; - sprintf(splitsuffix, "_splitncnn_%d", refidx); - input_name = input_name + splitsuffix; - } + if (refcount <= 1) + { + continue; + } - fprintf(pp, " %s", input_name.c_str()); - } + char splitname[256]; + sprintf(splitname, "splitncnn_%d", internal_split); + fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - for (int j = 0; j < output_size; j++) { - const std::string& output_name = node.output(j); + fprintf(pp, " %s", input_name.c_str()); - fprintf(pp, " %s", output_name.c_str()); - } + for (int k = 0; k < refcount; k++) + { + fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + } + fprintf(pp, "\n"); - if (op == "Abs") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Acos") { - int op_type = 13; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Add") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "ArgMax") { - int axis = get_node_attr_i(node, "axis"); - int keepdims = get_node_attr_i(node, "keepdims"); - fprintf(pp, " 0=%d", axis - 1); - fprintf(pp, " 3=%d", keepdims); - } else if (op == "Asin") { - int op_type = 12; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Atan") { - int op_type = 14; - fprintf(pp, " 0=%d", op_type); - } else if (op == "AveragePool" || op == "MaxPool") { - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - int ceil_mode = get_node_attr_i(node, "ceil_mode", 0); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector pads = get_node_attr_ai(node, "pads"); - - int pool = op == "AveragePool" ? 
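// [Editorial sketch, not part of the patch] The MemoryData hunk above turns a
// leftover initializer into a constant blob, mapping proto dims onto ncnn's
// w/h/c parameters innermost-dimension-first. A simplified restatement; the
// real hunk additionally drops a leading dimension of 1 in the 2-D and 3-D
// cases and converts int64/int32/bool/double payloads to float:
#include <cstdio>

static void write_memorydata_shape(FILE* pp, const int* dims, int dims_size)
{
    if (dims_size >= 1) fprintf(pp, " 0=%d", dims[dims_size - 1]); // w
    if (dims_size >= 2) fprintf(pp, " 1=%d", dims[dims_size - 2]); // h
    if (dims_size >= 3) fprintf(pp, " 2=%d", dims[dims_size - 3]); // c
}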
1 : 0; - int pad_mode = 1; - - if (auto_pad == "SAME_UPPER") { - pad_mode = 2; - } else if (auto_pad == "SAME_LOWER") { - pad_mode = 3; - } - - if (ceil_mode == 1) { - pad_mode = 0; - } - - fprintf(pp, " 0=%d", pool); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 2=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 2=%d", strides[1]); - fprintf(pp, " 12=%d", strides[0]); - } - - if (pads.size() == 1) { - fprintf(pp, " 3=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 3=%d", pads[1]); - fprintf(pp, " 13=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 3=%d", pads[1]); - fprintf(pp, " 13=%d", pads[0]); - fprintf(pp, " 14=%d", pads[3]); - fprintf(pp, " 15=%d", pads[2]); - } - - fprintf(pp, " 5=%d", pad_mode); - - if (op == "AveragePool") { - int avgpool_count_include_pad = get_node_attr_i(node, "count_include_pad", 0); - fprintf(pp, " 6=%d", avgpool_count_include_pad); - } - } else if (op == "BatchNormalization") { - float epsilon = get_node_attr_f(node, "epsilon", 1e-5f); - - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - const onnx::TensorProto& mean = weights[node.input(3)]; - const onnx::TensorProto& var = weights[node.input(4)]; - - int channels = get_tensor_proto_data_size(scale); - - fprintf(pp, " 0=%d", channels); - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(mean, bp); - // apply epsilon to var - { - const float* v = - var.has_raw_data() ? (const float*)var.raw_data().data() : var.float_data().data(); - - for (int j = 0; j < channels; j++) { - float ve = v[j] + epsilon; - fwrite(&ve, sizeof(float), 1, bp); - } - } - fwrite_tensor_proto_data(B, bp); - } else if (op == "BiasGelu") { - const onnx::TensorProto& B = weights[node.input(1)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - } else if (op == "Ceil") { - int op_type = 3; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Clip") { - float min; - float max; - if (node.input_size() == 1) { - min = get_node_attr_f(node, "min", -FLT_MAX); - max = get_node_attr_f(node, "max", FLT_MAX); - } else { - min = weights.find(node.input(1)) != weights.end() - ? get_node_attr_from_input(weights[node.input(1)]) - : -FLT_MAX; - max = weights.find(node.input(2)) != weights.end() - ? get_node_attr_from_input(weights[node.input(2)]) - : FLT_MAX; - } - - fprintf(pp, " 0=%e", min); - fprintf(pp, " 1=%e", max); - } else if (op == "Concat") { - int axis = get_node_attr_i(node, "axis", 1); - fprintf(pp, " 0=%d", axis - 1); - } else if (op == "Constant") { - // never reach here - } else if (op == "ConstantOfShape") { - float value = 0.f; - value = get_node_attr_f(node, "value", 0.f); - fprintf(pp, " 0=%f", value); - - } else if (op == "Conv") { - const onnx::TensorProto& W = weights[node.input(1)]; - - int num_filter = W.dims(0); - int has_bias = node.input_size() == 3 ? 
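// [Editorial sketch, not part of the patch] The BatchNormalization hunk above
// does not store epsilon as a layer parameter; it folds it into the saved
// variance (var' = var + eps) so the runtime can use the tensor as-is. The
// per-channel fold in isolation:
#include <cstdio>

static void fwrite_var_plus_eps(FILE* bp, const float* var, int channels, float eps)
{
    for (int j = 0; j < channels; j++)
    {
        float ve = var[j] + eps;
        fwrite(&ve, sizeof(float), 1, bp);
    }
}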
1 : 0; - - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector dilations = get_node_attr_ai(node, "dilations"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector pads = get_node_attr_ai(node, "pads"); - int group = get_node_attr_i(node, "group", 1); - - fprintf(pp, " 0=%d", num_filter); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (dilations.size() == 1) { - fprintf(pp, " 2=%d", dilations[0]); - } else if (dilations.size() == 2) { - fprintf(pp, " 2=%d", dilations[1]); - fprintf(pp, " 12=%d", dilations[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 3=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 3=%d", strides[1]); - fprintf(pp, " 13=%d", strides[0]); - } - - if (auto_pad == "SAME_UPPER") { - fprintf(pp, " 4=-233"); - } else if (auto_pad == "SAME_LOWER") { - fprintf(pp, " 4=-234"); - } else { - if (pads.size() == 1) { - fprintf(pp, " 4=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - fprintf(pp, " 15=%d", pads[3]); - fprintf(pp, " 16=%d", pads[2]); - } - } - - fprintf(pp, " 5=%d", has_bias); - - fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); - - if (group > 1) { - fprintf(pp, " 7=%d", group); - } - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - if (has_bias) { - const onnx::TensorProto& B = weights[node.input(2)]; - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "ConvTranspose") { - const onnx::TensorProto& W = weights[node.input(1)]; - - int has_bias = node.input_size() == 3 ? 
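// [Editorial sketch, not part of the patch] In the Conv hunk above the pad
// parameter doubles as a mode flag: 4=-233 and 4=-234 request SAME_UPPER /
// SAME_LOWER auto-padding, while explicit ONNX pads [top, left, bottom,
// right] land on ids 4/14/15/16 (left/top/right/bottom). A restatement:
#include <cstdio>
#include <string>
#include <vector>

static void write_conv_pads(FILE* pp, const std::string& auto_pad, const std::vector<int>& pads)
{
    if (auto_pad == "SAME_UPPER")      fprintf(pp, " 4=-233");
    else if (auto_pad == "SAME_LOWER") fprintf(pp, " 4=-234");
    else if (pads.size() == 4)
    {
        fprintf(pp, " 4=%d", pads[1]);  // left
        fprintf(pp, " 14=%d", pads[0]); // top
        fprintf(pp, " 15=%d", pads[3]); // right
        fprintf(pp, " 16=%d", pads[2]); // bottom
    }
}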
1 : 0; - - std::string auto_pad = get_node_attr_s(node, "auto_pad"); - std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); - std::vector dilations = get_node_attr_ai(node, "dilations"); - std::vector strides = get_node_attr_ai(node, "strides"); - std::vector output_padding = get_node_attr_ai(node, "output_padding"); - std::vector output_shape = get_node_attr_ai(node, "output_shape"); - std::vector pads = get_node_attr_ai(node, "pads"); - int group = get_node_attr_i(node, "group", 1); - int num_filter = W.dims(1) * group; - - fprintf(pp, " 0=%d", num_filter); - - if (kernel_shape.size() == 1) { - fprintf(pp, " 1=%d", kernel_shape[0]); - } else if (kernel_shape.size() == 2) { - fprintf(pp, " 1=%d", kernel_shape[1]); - fprintf(pp, " 11=%d", kernel_shape[0]); - } - - if (dilations.size() == 1) { - fprintf(pp, " 2=%d", dilations[0]); - } else if (dilations.size() == 2) { - fprintf(pp, " 2=%d", dilations[1]); - fprintf(pp, " 12=%d", dilations[0]); - } - - if (strides.size() == 1) { - fprintf(pp, " 3=%d", strides[0]); - } else if (strides.size() == 2) { - fprintf(pp, " 3=%d", strides[1]); - fprintf(pp, " 13=%d", strides[0]); - } - - if (auto_pad == "SAME_UPPER") { - fprintf(pp, " 4=-233"); - } else if (auto_pad == "SAME_LOWER") { - fprintf(pp, " 4=-234"); - } else { - if (pads.size() == 1) { - fprintf(pp, " 4=%d", pads[0]); - } else if (pads.size() == 2) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - } else if (pads.size() == 4) { - fprintf(pp, " 4=%d", pads[1]); - fprintf(pp, " 14=%d", pads[0]); - fprintf(pp, " 15=%d", pads[3]); - fprintf(pp, " 16=%d", pads[2]); - } - } - - if (output_padding.size() == 1) { - fprintf(pp, " 18=%d", output_padding[0]); - } else if (output_padding.size() == 2) { - fprintf(pp, " 18=%d", output_padding[1]); - fprintf(pp, " 19=%d", output_padding[0]); - } - - if (output_shape.size() == 1) { - fprintf(pp, " 20=%d", output_shape[0]); - } else if (output_shape.size() == 2) { - fprintf(pp, " 20=%d", output_shape[1]); - fprintf(pp, " 21=%d", output_shape[0]); - } - - fprintf(pp, " 5=%d", has_bias); - - fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); - - if (group > 1) { - fprintf(pp, " 7=%d", group); - } - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int maxk = 0; - if (kernel_shape.size() == 2) { - maxk = kernel_shape[1] * kernel_shape[0]; - } else { - maxk = kernel_shape[0] * kernel_shape[0]; - } - int weight_data_size = get_tensor_proto_data_size(W); - const float* weight_data = 0; - if (W.has_raw_data()) { - weight_data = (const float*)W.raw_data().data(); - } else if (W.data_type() == 1) { - weight_data = W.float_data().data(); - } - for (int g = 0; g < group; g++) { - // reorder weight from inch-outch to outch-inch - int num_filter_g = num_filter / group; - int num_input = weight_data_size / maxk / num_filter_g / group; - const float* weight_data_ptr = weight_data + g * maxk * num_filter_g * num_input; - for (int k = 0; k < num_filter_g; k++) { - for (int j = 0; j < num_input; j++) { - fwrite(weight_data_ptr + (j * num_filter_g + k) * maxk, sizeof(float), maxk, bp); - } - } - } - - if (has_bias) { - const onnx::TensorProto& B = weights[node.input(2)]; - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "Cos") { - int op_type = 10; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Crop") { - auto starts = get_node_attr_ai(node, "starts"); - fprintf(pp, " -23309=%zu", starts.size()); - for (size_t j = 0; j < starts.size(); ++j) { - fprintf(pp, ",%i", starts[j]); - } - auto ends = 
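// [Editorial sketch, not part of the patch] The deconvolution weight shuffle
// above in isolation: ONNX ConvTranspose weights are laid out
// input-channel-major within each group, while ncnn expects
// output-channel-major, so each group is rewritten as outch-inch-k:
#include <cstdio>

static void reorder_deconv_group(FILE* bp, const float* w,
                                 int num_input, int num_filter_g, int maxk)
{
    for (int k = 0; k < num_filter_g; k++)  // output channel
        for (int j = 0; j < num_input; j++) // input channel
            fwrite(w + (j * num_filter_g + k) * maxk, sizeof(float), maxk, bp);
}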
get_node_attr_ai(node, "ends"); - fprintf(pp, " -23310=%zu", ends.size()); - for (size_t j = 0; j < ends.size(); ++j) { - fprintf(pp, ",%i", ends[j]); - } - auto axis = get_node_attr_ai(node, "axis"); - fprintf(pp, " -23311=%zu", axis.size()); - for (size_t j = 0; j < axis.size(); ++j) { - fprintf(pp, ",%i", axis[j]); - } - } else if (op == "DepthToSpace") { - // pixelshuffle - int scale_factor = get_node_attr_i(node, "blocksize", 1); - std::string mode = get_node_attr_s(node, "mode"); - fprintf(pp, " 0=%d", scale_factor); - if (mode == "CRD") { - fprintf(pp, " 1=0"); - } else if (mode == "DCR") { - fprintf(pp, " 1=1"); - } - } else if (op == "DetectionOutput") { - float score_threshold = get_node_attr_f(node, "score_threshold"); - float nms_threshold = get_node_attr_f(node, "nms_threshold"); - int nms_top_k = get_node_attr_i(node, "nms_top_k"); - int keep_top_k = get_node_attr_i(node, "keep_top_k"); - int num_class = get_node_attr_i(node, "num_class"); - std::vector vars = get_node_attr_af(node, "vars"); - fprintf(pp, " 0=%d", num_class); - fprintf(pp, " 1=%f", nms_threshold); - fprintf(pp, " 2=%d", nms_top_k); - fprintf(pp, " 3=%d", keep_top_k); - fprintf(pp, " 4=%f", score_threshold); - fprintf(pp, " 5=%f", vars[0]); - fprintf(pp, " 6=%f", vars[1]); - fprintf(pp, " 7=%f", vars[2]); - fprintf(pp, " 8=%f", vars[3]); - } else if (op == "Div") { - int op_type = 3; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Dropout") { - // no-op - } else if (op == "Elu") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - fprintf(pp, " 0=%e", alpha); - } else if (op == "EmbedLayerNormalization") { - const onnx::TensorProto& words = weights[node.input(2)]; - const onnx::TensorProto& positions = weights[node.input(3)]; - const onnx::TensorProto& W = weights[node.input(5)]; - const onnx::TensorProto& B = weights[node.input(6)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - fprintf(pp, " 1=%d", get_tensor_proto_data_size(words)); - fprintf(pp, " 2=%d", get_tensor_proto_data_size(positions)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(words, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(positions, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - } else if (op == "Equal") { - int op_type = 0; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Exp") { - int op_type = 7; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Flatten") { - int axis = get_node_attr_i(node, "axis", 1); - if (axis != 1) { - fprintf(stderr, "Unsupported Flatten axis %d!\n", axis); - } - } else if (op == "Floor") { - int op_type = 2; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Gather") { - if (weights[node.input(1)].dims_size() > 1) { - fprintf(stderr, "Unsupported indice dims > 1"); - } - int axis = get_node_attr_i(node, "axis", 1) - 1; - if (axis < 0) { - fprintf(stderr, "Unsupported Gather axis: %d\n", axis + 1); - } - fprintf(pp, " 0=%d", axis); - } else if (op == "Gelu") { - fprintf(pp, " 0=1"); - } else if (op == "Gemm") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 1.f); - int transA = get_node_attr_i(node, "transA", 0); - int transB 
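// [Editorial sketch, not part of the patch] Several hunks above write
// `int quantize_tag = 0;` immediately before a weight blob: weight arrays in
// the ncnn .bin stream carry a 4-byte storage tag, and 0 announces a plain
// float32 payload. The recurring idiom as one helper:
#include <cstdio>

static void fwrite_float_blob(FILE* bp, const float* data, size_t count)
{
    int quantize_tag = 0; // 0 = unquantized float32 data follows
    fwrite(&quantize_tag, sizeof(int), 1, bp);
    fwrite(data, sizeof(float), count, bp);
}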
= get_node_attr_i(node, "transB", 0); - - if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) { - // InnerProduct-like A * B + C - const onnx::TensorProto& B = weights[node.input(1)]; - // B has transposed. - int num_output = B.dims(0); - fprintf(pp, " 0=%d", num_output); - if (node.input_size() == 3) { - fprintf(pp, " 1=1"); - } else { - fprintf(pp, " 1=0"); - } - fprintf(pp, " 2=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(B, bp); - if (node.input_size() == 3) { - const onnx::TensorProto& C = weights[node.input(2)]; - fwrite_tensor_proto_data(C, bp); - } - } else { - // gemm - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - fprintf(pp, " 2=%d", transA); - fprintf(pp, " 3=%d", transB); - } - } else if (op == "GlobalAveragePool") { - int pool = 1; - int global_pool = 1; - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 4=%d", global_pool); - } else if (op == "GlobalMaxPool") { - int pool = 0; - int global_pool = 1; - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 4=%d", global_pool); - } else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || - op == "adaptive_max_pool2d") { - int pool = 0; - if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d") { - pool = 1; - } - int adaptive_pooling = 1; - const onnx::TensorProto& out_shape_tp = weights[node.input(1)]; - std::vector out_shape = get_node_attr_from_input_ai(out_shape_tp); - - fprintf(pp, " 0=%d", pool); - fprintf(pp, " 7=%d", adaptive_pooling); - if (out_shape.size() == 1) { - fprintf(pp, " 8=%d", out_shape[0]); - } else if (out_shape.size() == 2) { - // out_w - fprintf(pp, " 8=%d", out_shape[1]); - // out_h - fprintf(pp, " 18=%d", out_shape[0]); - } - } else if (op == "GroupNorm") { - int groups = get_node_attr_i(node, "groups", 1); - int channels = get_node_attr_i(node, "channels", 1); - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - int affine = get_node_attr_i(node, "affine", 1); - - if (affine) { - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && - affine_B[0] == 0.f) { - affine = 0; - } else { - affine = 0; - { - for (int j = 0; j < channels; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - } - } - - fprintf(pp, " 0=%d", groups); - fprintf(pp, " 1=%d", channels); - fprintf(pp, " 2=%e", eps); - fprintf(pp, " 3=%d", affine); - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "GRU") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int 
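// [Editorial sketch, not part of the patch] The GroupNorm affine scan above
// (the same test also guards InstanceNormalization and LayerNorm) keeps the
// affine weights only if some channel deviates from the identity transform
// S=1, B=0. As a standalone predicate:
#include <vector>

static bool needs_affine(const std::vector<float>& S, const std::vector<float>& B)
{
    for (size_t j = 0; j < S.size(); j++)
        if (S[j] != 1.f || B[j] != 0.f)
            return true;
    return false;
}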
num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - // reorder num_directions-URN-hidden-size to - // num_directions-RUN-hidden-size - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(W) / 3 / num_directions; - const float* wptr = - W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); - - const float* uptr = wptr; - const float* rptr = wptr + weight_data_size_g; - const float* nptr = wptr + weight_data_size_g * 2; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - uptr += weight_data_size_g * 3; - rptr += weight_data_size_g * 3; - nptr += weight_data_size_g * 3; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - } - } - - // reduce U and R bias except N - // reorder num_directions-URN-hidden to num_directions-RUN-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 3 / num_directions; - const float* bptr = - B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); - const float* wuptr = bptr; - const float* wrptr = bptr + bias_data_size_g; - const float* wnptr = bptr + bias_data_size_g * 2; - const float* buptr = bptr + bias_data_size_g * 3; - const float* brptr = bptr + bias_data_size_g * 4; - const float* bnptr = bptr + bias_data_size_g * 5; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wrptr[j] + brptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wuptr[j] + buptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - fwrite(wnptr, sizeof(float), bias_data_size_g, bp); - fwrite(bnptr, sizeof(float), bias_data_size_g, bp); - - if (direction_type == 2) { - wuptr += bias_data_size_g * 6; - wrptr += bias_data_size_g * 6; - wnptr += bias_data_size_g * 6; - buptr += bias_data_size_g * 6; - brptr += bias_data_size_g * 6; - bnptr += bias_data_size_g * 6; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wrptr[j] + brptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = wuptr[j] + buptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - fwrite(wnptr, sizeof(float), bias_data_size_g, bp); - fwrite(bnptr, sizeof(float), bias_data_size_g, bp); - } - } - - // reorder num_directions-URN-hidden-hidden to - // num_directions-RUN-hidden-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(R) / 3 / num_directions; - const float* Rptr = - R.has_raw_data() ? 
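// [Editorial sketch, not part of the patch] As I read the GRU bias hunk
// above: ONNX packs biases as [Wb(u,r,n), Rb(u,r,n)]; ncnn wants the r and u
// biases pre-summed but keeps the two n biases separate, since the candidate
// gate applies them to different terms. One direction, with g = hidden size:
#include <cstdio>

static void write_gru_bias(FILE* bp, const float* wb, const float* rb, int g)
{
    const float *wu = wb, *wr = wb + g, *wn = wb + 2 * g;
    const float *bu = rb, *br = rb + g, *bn = rb + 2 * g;
    for (int j = 0; j < g; j++) { float v = wr[j] + br[j]; fwrite(&v, sizeof(float), 1, bp); }
    for (int j = 0; j < g; j++) { float v = wu[j] + bu[j]; fwrite(&v, sizeof(float), 1, bp); }
    fwrite(wn, sizeof(float), g, bp);
    fwrite(bn, sizeof(float), g, bp);
}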
(const float*)R.raw_data().data() : R.float_data().data(); - - const float* uptr = Rptr; - const float* rptr = Rptr + weight_data_size_g; - const float* nptr = Rptr + weight_data_size_g * 2; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - uptr += weight_data_size_g * 3; - rptr += weight_data_size_g * 3; - nptr += weight_data_size_g * 3; - fwrite(rptr, sizeof(float), weight_data_size_g, bp); - fwrite(uptr, sizeof(float), weight_data_size_g, bp); - fwrite(nptr, sizeof(float), weight_data_size_g, bp); - } - } - } else if (op == "HardSigmoid") { - float alpha = get_node_attr_f(node, "alpha", 0.2f); - float beta = get_node_attr_f(node, "beta", 0.5f); - - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - } else if (op == "HardSwish") { - float alpha = get_node_attr_f(node, "alpha", 0.2f); - float beta = get_node_attr_f(node, "beta", 0.5f); - - fprintf(pp, " 0=%e", alpha); - fprintf(pp, " 1=%e", beta); - } else if (op == "ImageScaler") { - std::vector bias = get_node_attr_af(node, "bias"); - float scale = get_node_attr_f(node, "scale", 1.f); - - int channels = (int)bias.size(); - - fprintf(pp, " 0=%d", channels); - fprintf(pp, " 1=1"); - - for (int j = 0; j < channels; j++) { - fwrite(&scale, sizeof(float), 1, bp); - } - fwrite(&bias[0], sizeof(float), channels, bp); - } else if (op == "InstanceNormalization") { - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - int channels = (int)affine_S.size(); - int affine = 0; - { - for (int j = 0; j < channels; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - - fprintf(pp, " 0=%d", channels); - fprintf(pp, " 1=%e", eps); - fprintf(pp, " 2=%d", affine); - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "LayerNorm") { - float eps = get_node_attr_f(node, "epsilon", 1e-5f); - int affine = get_node_attr_i(node, "affine", 1); - - if (affine) { - // discard affine-less S=1 B=0 - std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); - std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); - int affine_size = (int)affine_S.size(); - affine = 0; - { - for (int j = 0; j < affine_size; j++) { - if (affine_S[j] != 1.f || affine_B[j] != 0.f) { - affine = 1; - break; - } - } - } - - if (affine) { - fprintf(pp, " 0=%d", affine_size); - } - } - - fprintf(pp, " 1=%e", eps); - fprintf(pp, " 2=%d", affine); - - if (affine) { - const onnx::TensorProto& scale = weights[node.input(1)]; - const onnx::TensorProto& B = weights[node.input(2)]; - - fwrite_tensor_proto_data(scale, bp); - fwrite_tensor_proto_data(B, bp); - } - } else if (op == "LeakyRelu") { - float alpha = get_node_attr_f(node, "alpha", 0.01f); - fprintf(pp, " 0=%e", alpha); - } else if (op == "Threshold") { - float threshold = get_node_attr_f(node, "threshold", 0.f); - fprintf(pp, " 0=%e", threshold); - } else if (op == "Log") { - int op_type = 8; - fprintf(pp, " 0=%d", op_type); - } else if (op == "LRN") { - float alpha = get_node_attr_f(node, "alpha", 1.f); - float beta = get_node_attr_f(node, "beta", 
0.5f); - float bias = get_node_attr_f(node, "bias", 1.f); - int size = get_node_attr_i(node, "size", 1); - - int norm_region = 0; - - fprintf(pp, " 0=%d", norm_region); - fprintf(pp, " 1=%d", size); - fprintf(pp, " 2=%e", alpha); - fprintf(pp, " 3=%e", beta); - fprintf(pp, " 4=%e", bias); - } else if (op == "LSTM") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - // reorder num_directions-IOFG-hidden-size to - // num_directions-IFOG-hidden-size - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(W) / 4 / num_directions; - const float* wptr = - W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); - - const float* iptr = wptr; - const float* optr = wptr + weight_data_size_g; - const float* fptr = wptr + weight_data_size_g * 2; - const float* gptr = wptr + weight_data_size_g * 3; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - iptr += weight_data_size_g * 4; - optr += weight_data_size_g * 4; - fptr += weight_data_size_g * 4; - gptr += weight_data_size_g * 4; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - } - } - - // reduce xc and hc bias - // reorder num_directions-IOFG-hidden to num_directions-IFOG-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 4 / num_directions; - const float* xcbptr = - B.has_raw_data() ? 
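// [Editorial sketch, not part of the patch] The LSTM gate shuffle above as a
// standalone helper: ONNX orders the gates input, output, forget, cell
// (IOFG); ncnn expects IFOG, so the O and F blocks swap places. Here g is
// the per-gate element count:
#include <cstdio>

static void write_ifog(FILE* bp, const float* w, int g)
{
    fwrite(w,         sizeof(float), g, bp); // I
    fwrite(w + 2 * g, sizeof(float), g, bp); // F
    fwrite(w + g,     sizeof(float), g, bp); // O
    fwrite(w + 3 * g, sizeof(float), g, bp); // G
}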
(const float*)B.raw_data().data() : B.float_data().data(); - const float* xiptr = xcbptr; - const float* xoptr = xcbptr + bias_data_size_g; - const float* xfptr = xcbptr + bias_data_size_g * 2; - const float* xgptr = xcbptr + bias_data_size_g * 3; - const float* hiptr = xcbptr + bias_data_size_g * 4; - const float* hoptr = xcbptr + bias_data_size_g * 5; - const float* hfptr = xcbptr + bias_data_size_g * 6; - const float* hgptr = xcbptr + bias_data_size_g * 7; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xfptr[j] + hfptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xoptr[j] + hoptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xgptr[j] + hgptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - - if (direction_type == 2) { - xiptr += bias_data_size_g * 8; - xoptr += bias_data_size_g * 8; - xfptr += bias_data_size_g * 8; - xgptr += bias_data_size_g * 8; - hiptr += bias_data_size_g * 8; - hoptr += bias_data_size_g * 8; - hfptr += bias_data_size_g * 8; - hgptr += bias_data_size_g * 8; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xfptr[j] + hfptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xoptr[j] + hoptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xgptr[j] + hgptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - // reorder num_directions-IOFG-hidden-hidden to - // num_directions-IFOG-hidden-hidden - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int weight_data_size_g = get_tensor_proto_data_size(R) / 4 / num_directions; - const float* rptr = - R.has_raw_data() ? (const float*)R.raw_data().data() : R.float_data().data(); - - const float* iptr = rptr; - const float* optr = rptr + weight_data_size_g; - const float* fptr = rptr + weight_data_size_g * 2; - const float* gptr = rptr + weight_data_size_g * 3; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - - if (direction_type == 2) { - iptr += weight_data_size_g * 4; - optr += weight_data_size_g * 4; - fptr += weight_data_size_g * 4; - gptr += weight_data_size_g * 4; - fwrite(iptr, sizeof(float), weight_data_size_g, bp); - fwrite(fptr, sizeof(float), weight_data_size_g, bp); - fwrite(optr, sizeof(float), weight_data_size_g, bp); - fwrite(gptr, sizeof(float), weight_data_size_g, bp); - } - } - } else if (op == "MatMul") { - if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) { - // InnerProduct - const onnx::TensorProto& B = weights[node.input(1)]; - - int weight_data_size = get_tensor_proto_data_size(B); - - int num_output = B.dims(B.dims_size() - 1); - int num_input = weight_data_size / num_output; - - fprintf(pp, " 0=%d", num_output); - fprintf(pp, " 1=0"); - fprintf(pp, " 2=%d", weight_data_size); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - // reorder num_input-num_output to num_output-num_input - { - const float* bptr = - B.has_raw_data() ? 
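// [Editorial sketch, not part of the patch] The MatMul-to-InnerProduct weight
// rewrite above: the ONNX initializer B is (num_input x num_output) row-major,
// while ncnn InnerProduct stores weights as (num_output x num_input), hence
// the index swap:
#include <cstdio>

static void write_transposed(FILE* bp, const float* b, int num_input, int num_output)
{
    for (int j = 0; j < num_output; j++)
        for (int k = 0; k < num_input; k++)
        {
            float v = b[k * num_output + j];
            fwrite(&v, sizeof(float), 1, bp);
        }
}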
(const float*)B.raw_data().data() : B.float_data().data(); - - for (int j = 0; j < num_output; j++) { - for (int k = 0; k < num_input; k++) { - float vb = bptr[k * num_output + j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - // fwrite_tensor_proto_data(B, bp) - } else { - // default matrix multiplication - } - } else if (op == "Max") { - int op_type = 4; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Min") { - int op_type = 5; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Mul") { - int op_type = 2; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "MultiHeadAttention") { - int embed_dim = get_node_attr_i(node, "embed_dim", 0); - int num_heads = get_node_attr_i(node, "num_heads", 0); + internal_split++; + } - fprintf(pp, " 0=%d", embed_dim); - fprintf(pp, " 1=%d", num_heads); + for (int i = 0; i < node_count; i++) + { + const onnx::NodeProto& node = mutable_graph->node(i); + const std::string& op = node.op_type(); - if (node.input_size() == 5) { - const onnx::TensorProto& qkvw = weights[node.input(1)]; - const onnx::TensorProto& qkvb = weights[node.input(2)]; - const onnx::TensorProto& ow = weights[node.input(3)]; - const onnx::TensorProto& ob = weights[node.input(4)]; + // fprintf(stderr, "op = %s\n", op.c_str()); - int weight_data_size = get_tensor_proto_data_size(ow); + if (op == "noop_reducedncnn") + { + continue; + } - fprintf(pp, " 2=%d", weight_data_size); + std::string name = node.name(); + if (name.empty()) + { + name = node.output(0); + } - int quantize_tag = 0; + int input_size = node.input_size(); + int output_size = node.output_size(); - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose qw + for (int j = 0; j < (int)node.input_size(); j++) { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + const std::string& input_name = node.input(j); - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k]; - fwrite(&vb, sizeof(float), 1, bp); + // check weight + if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) + { + input_size--; } - } - fwrite(bptr, sizeof(float), embed_dim, bp); - } + if (input_name.empty()) + { + input_size--; + } - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose kw + // fprintf(stderr, " input = %s\n", input_name.c_str()); + } + /* + for (int j=0; j<(int)node.output_size(); j++) { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? 
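// [Editorial sketch, not part of the patch] Add/Div/Max/Min/Mul above all
// share one idiom: a "with_scalar" attribute folds a scalar right-hand
// operand into the BinaryOp layer itself (1=with_scalar, 2=b) instead of
// feeding it as a second blob. The idiom as one helper:
#include <cstdio>

static void write_binaryop_scalar(FILE* pp, int op_type, int with_scalar, float b)
{
    fprintf(pp, " 0=%d", op_type);
    if (with_scalar)
    {
        fprintf(pp, " 1=%d", with_scalar);
        fprintf(pp, " 2=%e", b);
    }
}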
(const float*)qkvb.raw_data().data() : qkvb.float_data().data(); - bptr += embed_dim; + const std::string& output_name = node.output(j); + fprintf(stderr, " output = %s\n", output_name.c_str()); + } + */ - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k + embed_dim]; - fwrite(&vb, sizeof(float), 1, bp); + if (op == "Abs") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Acos") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Add") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "ArgMax") + { + fprintf(pp, "%-16s", "TopK"); + } + else if (op == "Asin") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Atan") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "AveragePool" || op == "MaxPool") + { + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + if (kernel_shape.size() == 1) + { + fprintf(pp, "%-16s", "Pooling1D"); + } + else + { + fprintf(pp, "%-16s", "Pooling"); } - } - - fwrite(bptr, sizeof(float), embed_dim, bp); } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose vw + else if (op == "BatchNormalization") { - const float* wptr = - qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); - const float* bptr = - qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); - bptr += embed_dim * 2; - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim * 3 + k + embed_dim * 2]; - fwrite(&vb, sizeof(float), 1, bp); + fprintf(pp, "%-16s", "BatchNorm"); + } + else if (op == "BiasGelu") + { + fprintf(pp, "%-16s", "BiasGelu"); + } + else if (op == "Cast") + { + fprintf(pp, "%-16s", "Noop"); + } + else if (op == "Ceil") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Clip") + { + fprintf(pp, "%-16s", "Clip"); + } + else if (op == "Concat") + { + fprintf(pp, "%-16s", "Concat"); + } + else if (op == "Constant") + { + continue; + } + else if (op == "ConstantOfShape") + { + fprintf(pp, "%-16s", "ConstantOfShape"); + } + else if (op == "Conv") + { + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + if (kernel_shape.size() == 1) + { + fprintf(pp, "%-16s", "Convolution1D"); + } + else + { + int group = get_node_attr_i(node, "group", 1); + if (group > 1) + { + fprintf(pp, "%-16s", "ConvolutionDepthWise"); + } + else + { + fprintf(pp, "%-16s", "Convolution"); + } } - } - - fwrite(bptr, sizeof(float), embed_dim, bp); } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose ow + else if (op == "ConvTranspose") { - const float* wptr = - ow.has_raw_data() ? 
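// [Editorial sketch, not part of the patch] My reading of the fused
// MultiHeadAttention case above: the packed weight is
// (embed_dim x 3*embed_dim) with the Q, K and V blocks side by side, and each
// block is copied out column-band by column-band (the fused bias splits at
// the same embed_dim offsets):
#include <cstdio>

static void write_qkv_slice(FILE* bp, const float* qkvw, int embed_dim, int which /*0=Q,1=K,2=V*/)
{
    for (int j = 0; j < embed_dim; j++)
        for (int k = 0; k < embed_dim; k++)
        {
            float v = qkvw[j * embed_dim * 3 + which * embed_dim + k];
            fwrite(&v, sizeof(float), 1, bp);
        }
}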
(const float*)ow.raw_data().data() : ow.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); + int group = get_node_attr_i(node, "group", 1); + if (group > 1) + { + fprintf(pp, "%-16s", "DeconvolutionDepthWise"); + } + else + { + fprintf(pp, "%-16s", "Deconvolution"); } - } } - fwrite_tensor_proto_data(ob, bp); - } else { - const onnx::TensorProto& qw = weights[node.input(3)]; - const onnx::TensorProto& qb = weights[node.input(4)]; - const onnx::TensorProto& kw = weights[node.input(5)]; - const onnx::TensorProto& kb = weights[node.input(6)]; - const onnx::TensorProto& vw = weights[node.input(7)]; - const onnx::TensorProto& vb = weights[node.input(8)]; - const onnx::TensorProto& ow = weights[node.input(9)]; - const onnx::TensorProto& ob = weights[node.input(10)]; - - int weight_data_size = get_tensor_proto_data_size(qw); - - fprintf(pp, " 2=%d", weight_data_size); - - int quantize_tag = 0; - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose qw + else if (op == "Cos") { - const float* wptr = - qw.has_raw_data() ? (const float*)qw.raw_data().data() : qw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Crop") + { + fprintf(pp, "%-16s", "Crop"); + } + else if (op == "DepthToSpace") + { + fprintf(pp, "%-16s", "PixelShuffle"); + } + else if (op == "DetectionOutput") + { + fprintf(pp, "%-16s", "DetectionOutput"); + } + else if (op == "Div") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Dropout") + { + fprintf(pp, "%-16s", "Dropout"); + output_size = 1; + } + else if (op == "Elu") + { + fprintf(pp, "%-16s", "ELU"); + } + else if (op == "EmbedLayerNormalization") + { + fprintf(pp, "%-16s", "EmbedLayerNormalization"); + } + else if (op == "Equal") + { + fprintf(pp, "%-16s", "Compare"); + } + else if (op == "Exp") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Expand") + { + fprintf(pp, "%-16s", "Expand"); + } + else if (op == "Flatten") + { + fprintf(pp, "%-16s", "Flatten"); + } + else if (op == "Floor") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Gather") + { + fprintf(pp, "%-16s", "Gather"); + } + else if (op == "Gelu") + { + fprintf(pp, "%-16s", "GELU"); + } + else if (op == "Gemm") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) + { + // InnerProduct-like A * B + C + fprintf(pp, "%-16s", "InnerProduct"); + } + else + { + fprintf(pp, "%-16s", "Gemm"); + } + } + else if (op == "GlobalAveragePool") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "GlobalMaxPool") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + fprintf(pp, "%-16s", "Pooling"); + } + else if (op == "GroupNorm") + { + fprintf(pp, "%-16s", "GroupNorm"); + } + else if (op == "GRU") + { + fprintf(pp, "%-16s", "GRU"); + } + else if (op == "HardSigmoid") + { + fprintf(pp, "%-16s", "HardSigmoid"); + } + else if (op == "HardSwish") + { + fprintf(pp, "%-16s", "HardSwish"); + } + else if (op == "ImageScaler") + { + fprintf(pp, "%-16s", "Scale"); + 
} + else if (op == "InstanceNormalization") + { + fprintf(pp, "%-16s", "InstanceNorm"); + } + else if (op == "LayerNorm") + { + fprintf(pp, "%-16s", "LayerNorm"); + } + else if (op == "LeakyRelu") + { + fprintf(pp, "%-16s", "ReLU"); + } + else if (op == "Threshold") + { + fprintf(pp, "%-16s", "Threshold"); + } + else if (op == "Log") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "LRN") + { + fprintf(pp, "%-16s", "LRN"); + } + else if (op == "LSTM") + { + fprintf(pp, "%-16s", "LSTM"); + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + fprintf(pp, "%-16s", "InnerProduct"); + } + else + { + fprintf(pp, "%-16s", "Gemm"); + } + } + else if (op == "Max") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Min") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Mul") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "MultiHeadAttention") + { + fprintf(pp, "%-16s", "MultiHeadAttention"); + } + else if (op == "Neg") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "NonMaxSuppression") + { + fprintf(pp, "%-16s", "NonMaxSuppression"); + } + else if (op == "Normalize") + { + fprintf(pp, "%-16s", "Normalize"); + } + else if (op == "Pad") + { + fprintf(pp, "%-16s", "Padding"); + } + else if (op == "PixelShuffle") + { + fprintf(pp, "%-16s", "PixelShuffle"); + } + else if (op == "Pow") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "PriorBox") + { + fprintf(pp, "%-16s", "PriorBox"); + } + else if (op == "PRelu") + { + fprintf(pp, "%-16s", "PReLU"); + } + else if (op == "Range") + { + fprintf(pp, "%-16s", "Range"); + } + else if (op == "Reciprocal") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || + op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || + op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") + { + fprintf(pp, "%-16s", "Reduction"); + } + else if (op == "Relu") + { + fprintf(pp, "%-16s", "ReLU"); + } + else if (op == "Reorg") + { + fprintf(pp, "%-16s", "Reorg"); + } + else if (op == "Reshape") + { + fprintf(pp, "%-16s", "Reshape"); + } + else if (op == "RNN") + { + fprintf(pp, "%-16s", "RNN"); + } + else if (op == "RDiv") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "RSub") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "RoiAlign") + { + fprintf(pp, "%-16s", "ROIAlign"); + } + else if (op == "ScatterND") + { + fprintf(pp, "%-16s", "ScatterND"); + } + else if (op == "Shape") + { + fprintf(pp, "%-16s", "Shape"); + } + else if (op == "ShuffleChannel") + { + fprintf(pp, "%-16s", "ShuffleChannel"); + } + else if (op == "Sigmoid") + { + fprintf(pp, "%-16s", "Sigmoid"); + } + else if (op == "Sin") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "SkipLayerNormalization") + { + fprintf(pp, "%-16s", "SkipLayerNormalization"); + } + else if (op == "Slice") + { + std::vector ends; + std::vector steps; + bool use_crop = true; + + if (node.input_size() == 1) + { + ends = get_node_attr_ai(node, "ends"); + steps = get_node_attr_ai(node, "steps"); // TODO + } + else + { + ends = get_node_attr_from_input_ai(weights[node.input(2)]); + if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); + } + + // assert step == 1 + for (int i = 0; i < (int)steps.size(); i++) + { + if (steps[i] != 1 && steps[i] < ends[i]) + { + use_crop = false; + break; + } + } + + if 
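// [Editorial sketch, not part of the patch] The step check above decides
// between two ncnn layers: a Slice whose steps are effectively 1 maps to the
// cheap Crop layer, anything else falls back to the general TensorSlice. A
// compact restatement of the patch's exact guard, including its
// steps[i] < ends[i] exemption:
#include <vector>

static bool slice_maps_to_crop(const std::vector<int>& steps, const std::vector<int>& ends)
{
    for (size_t i = 0; i < steps.size(); i++)
        if (steps[i] != 1 && steps[i] < ends[i])
            return false;
    return true;
}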
(use_crop) + { + fprintf(pp, "%-16s", "Crop"); + } + else + { + fprintf(pp, "%-16s", "TensorSlice"); + } + } + else if (op == "Softmax") + { + fprintf(pp, "%-16s", "Softmax"); + } + else if (op == "Softplus") + { + fprintf(pp, "%-16s", "Softplus"); + } + else if (op == "Split") + { + fprintf(pp, "%-16s", "Slice"); + } + else if (op == "Sqrt") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Squeeze") + { + std::vector axes = get_node_attr_ai(node, "axes"); + // fprintf(stderr, "axes[0]: %d\n",axes[0]); + if (axes[0] == 0) + { + fprintf(pp, "%-16s", "Noop"); + } + else + { + fprintf(pp, "%-16s", "Squeeze"); + } + } + else if (op == "Sub") + { + fprintf(pp, "%-16s", "BinaryOp"); + } + else if (op == "Sum") + { + fprintf(pp, "%-16s", "Eltwise"); + } + else if (op == "Swish") + { + fprintf(pp, "%-16s", "Swish"); + } + else if (op == "Tan") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Tanh") + { + fprintf(pp, "%-16s", "UnaryOp"); + } + else if (op == "Tile") + { + fprintf(pp, "%-16s", "TileOnnx"); + } + else if (op == "TopK") + { + fprintf(pp, "%-16s", "TopK"); + } + else if (op == "Transpose") + { + fprintf(pp, "%-16s", "Permute"); + } + else if (op == "Upsample" || op == "Resize") + { + fprintf(pp, "%-16s", "Interp"); + } + else if (op == "Unsqueeze") + { + std::vector axes = get_node_attr_ai(node, "axes"); + // fprintf(stderr, "axes[0]: %d\n",axes[0]); + if (axes[0] == 0) + { + fprintf(pp, "%-16s", "Noop"); + } + else + { + fprintf(pp, "%-16s", "ExpandDims"); + } + } + else if (op == "Where") + { + fprintf(pp, "%-16s", "Where"); + } + else if (op == "Yolov3DetectionOutput") + { + fprintf(pp, "%-16s", "Yolov3DetectionOutput"); + } + else + { + // TODO + fprintf(stderr, "%s not supported yet!\n", op.c_str()); + fprintf(pp, "%-16s", op.c_str()); + } + + fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + + for (int j = 0; j < (int)node.input_size(); j++) + { + std::string input_name = node.input(j); + + // check weight + if (weights.find(input_name) != weights.end() && node_reference[input_name] == 0) + { + continue; + } + + if (input_name.empty()) + { + continue; + } + + if (split_node_reference.find(input_name) != split_node_reference.end()) + { + int refidx = split_node_reference[input_name] - 1; + split_node_reference[input_name] = refidx; + + char splitsuffix[256]; + sprintf(splitsuffix, "_splitncnn_%d", refidx); + input_name = input_name + splitsuffix; + } + + fprintf(pp, " %s", input_name.c_str()); + } + + for (int j = 0; j < output_size; j++) + { + const std::string& output_name = node.output(j); + + fprintf(pp, " %s", output_name.c_str()); + } + + if (op == "Abs") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Acos") + { + int op_type = 13; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Add") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "ArgMax") + { + int axis = get_node_attr_i(node, "axis"); + int keepdims = get_node_attr_i(node, "keepdims"); + fprintf(pp, " 0=%d", axis - 1); + fprintf(pp, " 3=%d", keepdims); + } + else if (op == "Asin") + { + int op_type = 12; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Atan") + { + int op_type = 14; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "AveragePool" || op == "MaxPool") + { + std::string auto_pad = 
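// [Editorial sketch, not part of the patch] The recurring "axis - 1" above
// (ArgMax, and later Concat and Gather) converts an ONNX NCHW axis to an
// ncnn axis: ncnn blobs carry no batch dimension, so every axis shifts down
// by one. Trivially:
static int onnx_axis_to_ncnn(int onnx_axis)
{
    return onnx_axis - 1; // assumes onnx_axis >= 1, i.e. not the batch axis
}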
get_node_attr_s(node, "auto_pad"); + int ceil_mode = get_node_attr_i(node, "ceil_mode", 0); + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector strides = get_node_attr_ai(node, "strides"); + std::vector pads = get_node_attr_ai(node, "pads"); + + int pool = op == "AveragePool" ? 1 : 0; + int pad_mode = 1; + + if (auto_pad == "SAME_UPPER") + { + pad_mode = 2; + } + else if (auto_pad == "SAME_LOWER") + { + pad_mode = 3; + } + + if (ceil_mode == 1) + { + pad_mode = 0; + } + + fprintf(pp, " 0=%d", pool); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 2=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 2=%d", strides[1]); + fprintf(pp, " 12=%d", strides[0]); + } + + if (pads.size() == 1) + { + fprintf(pp, " 3=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 3=%d", pads[1]); + fprintf(pp, " 13=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 3=%d", pads[1]); + fprintf(pp, " 13=%d", pads[0]); + fprintf(pp, " 14=%d", pads[3]); + fprintf(pp, " 15=%d", pads[2]); + } + + fprintf(pp, " 5=%d", pad_mode); + + if (op == "AveragePool") + { + int avgpool_count_include_pad = get_node_attr_i(node, "count_include_pad", 0); + fprintf(pp, " 6=%d", avgpool_count_include_pad); + } + } + else if (op == "BatchNormalization") + { + float epsilon = get_node_attr_f(node, "epsilon", 1e-5f); + + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + const onnx::TensorProto& mean = weights[node.input(3)]; + const onnx::TensorProto& var = weights[node.input(4)]; + + int channels = get_tensor_proto_data_size(scale); + + fprintf(pp, " 0=%d", channels); + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(mean, bp); + // apply epsilon to var + { + const float* v = + var.has_raw_data() ? (const float*)var.raw_data().data() : var.float_data().data(); + + for (int j = 0; j < channels; j++) + { + float ve = v[j] + epsilon; + fwrite(&ve, sizeof(float), 1, bp); + } + } + fwrite_tensor_proto_data(B, bp); + } + else if (op == "BiasGelu") + { + const onnx::TensorProto& B = weights[node.input(1)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + } + else if (op == "Ceil") + { + int op_type = 3; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Clip") + { + float min; + float max; + if (node.input_size() == 1) + { + min = get_node_attr_f(node, "min", -FLT_MAX); + max = get_node_attr_f(node, "max", FLT_MAX); + } + else + { + min = weights.find(node.input(1)) != weights.end() ? get_node_attr_from_input(weights[node.input(1)]) : -FLT_MAX; + max = weights.find(node.input(2)) != weights.end() ? 
get_node_attr_from_input(weights[node.input(2)]) : FLT_MAX; + } + + fprintf(pp, " 0=%e", min); + fprintf(pp, " 1=%e", max); + } + else if (op == "Concat") + { + int axis = get_node_attr_i(node, "axis", 1); + fprintf(pp, " 0=%d", axis - 1); + } + else if (op == "Constant") + { + // never reach here + } + else if (op == "ConstantOfShape") + { + float value = 0.f; + value = get_node_attr_f(node, "value", 0.f); + fprintf(pp, " 0=%f", value); + } + else if (op == "Conv") + { + const onnx::TensorProto& W = weights[node.input(1)]; + + int num_filter = W.dims(0); + int has_bias = node.input_size() == 3 ? 1 : 0; + + std::string auto_pad = get_node_attr_s(node, "auto_pad"); + std::vector kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector dilations = get_node_attr_ai(node, "dilations"); + std::vector strides = get_node_attr_ai(node, "strides"); + std::vector pads = get_node_attr_ai(node, "pads"); + int group = get_node_attr_i(node, "group", 1); + + fprintf(pp, " 0=%d", num_filter); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (dilations.size() == 1) + { + fprintf(pp, " 2=%d", dilations[0]); + } + else if (dilations.size() == 2) + { + fprintf(pp, " 2=%d", dilations[1]); + fprintf(pp, " 12=%d", dilations[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 3=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 3=%d", strides[1]); + fprintf(pp, " 13=%d", strides[0]); + } + + if (auto_pad == "SAME_UPPER") + { + fprintf(pp, " 4=-233"); + } + else if (auto_pad == "SAME_LOWER") + { + fprintf(pp, " 4=-234"); + } + else + { + if (pads.size() == 1) + { + fprintf(pp, " 4=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + fprintf(pp, " 15=%d", pads[3]); + fprintf(pp, " 16=%d", pads[2]); + } + } + + fprintf(pp, " 5=%d", has_bias); + + fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); + + if (group > 1) + { + fprintf(pp, " 7=%d", group); + } + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + if (has_bias) + { + const onnx::TensorProto& B = weights[node.input(2)]; + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "ConvTranspose") + { + const onnx::TensorProto& W = weights[node.input(1)]; + + int has_bias = node.input_size() == 3 ? 
1 : 0; + + std::string auto_pad = get_node_attr_s(node, "auto_pad"); + std::vector<int> kernel_shape = get_node_attr_ai(node, "kernel_shape"); + std::vector<int> dilations = get_node_attr_ai(node, "dilations"); + std::vector<int> strides = get_node_attr_ai(node, "strides"); + std::vector<int> output_padding = get_node_attr_ai(node, "output_padding"); + std::vector<int> output_shape = get_node_attr_ai(node, "output_shape"); + std::vector<int> pads = get_node_attr_ai(node, "pads"); + int group = get_node_attr_i(node, "group", 1); + int num_filter = W.dims(1) * group; + + fprintf(pp, " 0=%d", num_filter); + + if (kernel_shape.size() == 1) + { + fprintf(pp, " 1=%d", kernel_shape[0]); + } + else if (kernel_shape.size() == 2) + { + fprintf(pp, " 1=%d", kernel_shape[1]); + fprintf(pp, " 11=%d", kernel_shape[0]); + } + + if (dilations.size() == 1) + { + fprintf(pp, " 2=%d", dilations[0]); + } + else if (dilations.size() == 2) + { + fprintf(pp, " 2=%d", dilations[1]); + fprintf(pp, " 12=%d", dilations[0]); + } + + if (strides.size() == 1) + { + fprintf(pp, " 3=%d", strides[0]); + } + else if (strides.size() == 2) + { + fprintf(pp, " 3=%d", strides[1]); + fprintf(pp, " 13=%d", strides[0]); + } + + if (auto_pad == "SAME_UPPER") + { + fprintf(pp, " 4=-233"); + } + else if (auto_pad == "SAME_LOWER") + { + fprintf(pp, " 4=-234"); + } + else + { + if (pads.size() == 1) + { + fprintf(pp, " 4=%d", pads[0]); + } + else if (pads.size() == 2) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + } + else if (pads.size() == 4) + { + fprintf(pp, " 4=%d", pads[1]); + fprintf(pp, " 14=%d", pads[0]); + fprintf(pp, " 15=%d", pads[3]); + fprintf(pp, " 16=%d", pads[2]); + } + } + + if (output_padding.size() == 1) + { + fprintf(pp, " 18=%d", output_padding[0]); + } + else if (output_padding.size() == 2) + { + fprintf(pp, " 18=%d", output_padding[1]); + fprintf(pp, " 19=%d", output_padding[0]); + } + + if (output_shape.size() == 1) + { + fprintf(pp, " 20=%d", output_shape[0]); + } + else if (output_shape.size() == 2) + { + fprintf(pp, " 20=%d", output_shape[1]); + fprintf(pp, " 21=%d", output_shape[0]); + } + + fprintf(pp, " 5=%d", has_bias); + + fprintf(pp, " 6=%d", get_tensor_proto_data_size(W)); + + if (group > 1) + { + fprintf(pp, " 7=%d", group); + } + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int maxk = 0; + if (kernel_shape.size() == 2) + { + maxk = kernel_shape[1] * kernel_shape[0]; + } + else + { + maxk = kernel_shape[0] * kernel_shape[0]; + } + int weight_data_size = get_tensor_proto_data_size(W); + const float* weight_data = 0; + if (W.has_raw_data()) + { + weight_data = (const float*)W.raw_data().data(); + } + else if (W.data_type() == 1) + { + weight_data = W.float_data().data(); + } + for (int g = 0; g < group; g++) + { + // reorder weight from inch-outch to outch-inch + int num_filter_g = num_filter / group; + int num_input = weight_data_size / maxk / num_filter_g / group; + const float* weight_data_ptr = weight_data + g * maxk * num_filter_g * num_input; + for (int k = 0; k < num_filter_g; k++) + { + for (int j = 0; j < num_input; j++) + { + fwrite(weight_data_ptr + (j * num_filter_g + k) * maxk, sizeof(float), maxk, bp); + } + } + } + + if (has_bias) + { + const onnx::TensorProto& B = weights[node.input(2)]; + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "Cos") + { + int op_type = 10; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Crop") + { + auto starts = get_node_attr_ai(node, "starts"); + fprintf(pp, " -23309=%zu", starts.size()); + for (size_t j = 0; j <
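+ /* The per-group loop above turns ONNX's (in_ch, out_ch/group, kh, kw) deconvolution weight into ncnn's (out_ch/group, in_ch, kh, kw): output block (k, j) is copied from source offset (j * num_filter_g + k) * maxk. Index sketch, assuming group=1, num_input=2, num_filter_g=2, maxk=9: block (k=1, j=0) is read starting at float offset (0 * 2 + 1) * 9 = 9. */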
starts.size(); ++j) + { + fprintf(pp, ",%i", starts[j]); + } + auto ends = get_node_attr_ai(node, "ends"); + fprintf(pp, " -23310=%zu", ends.size()); + for (size_t j = 0; j < ends.size(); ++j) + { + fprintf(pp, ",%i", ends[j]); + } + auto axis = get_node_attr_ai(node, "axis"); + fprintf(pp, " -23311=%zu", axis.size()); + for (size_t j = 0; j < axis.size(); ++j) + { + fprintf(pp, ",%i", axis[j]); + } + } + else if (op == "DepthToSpace") + { + // pixelshuffle + int scale_factor = get_node_attr_i(node, "blocksize", 1); + std::string mode = get_node_attr_s(node, "mode"); + fprintf(pp, " 0=%d", scale_factor); + if (mode == "CRD") + { + fprintf(pp, " 1=0"); + } + else if (mode == "DCR") + { + fprintf(pp, " 1=1"); + } + } + else if (op == "DetectionOutput") + { + float score_threshold = get_node_attr_f(node, "score_threshold"); + float nms_threshold = get_node_attr_f(node, "nms_threshold"); + int nms_top_k = get_node_attr_i(node, "nms_top_k"); + int keep_top_k = get_node_attr_i(node, "keep_top_k"); + int num_class = get_node_attr_i(node, "num_class"); + std::vector vars = get_node_attr_af(node, "vars"); + fprintf(pp, " 0=%d", num_class); + fprintf(pp, " 1=%f", nms_threshold); + fprintf(pp, " 2=%d", nms_top_k); + fprintf(pp, " 3=%d", keep_top_k); + fprintf(pp, " 4=%f", score_threshold); + fprintf(pp, " 5=%f", vars[0]); + fprintf(pp, " 6=%f", vars[1]); + fprintf(pp, " 7=%f", vars[2]); + fprintf(pp, " 8=%f", vars[3]); + } + else if (op == "Div") + { + int op_type = 3; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Dropout") + { + // no-op + } + else if (op == "Elu") + { + float alpha = get_node_attr_f(node, "alpha", 1.f); + fprintf(pp, " 0=%e", alpha); + } + else if (op == "EmbedLayerNormalization") + { + const onnx::TensorProto& words = weights[node.input(2)]; + const onnx::TensorProto& positions = weights[node.input(3)]; + const onnx::TensorProto& W = weights[node.input(5)]; + const onnx::TensorProto& B = weights[node.input(6)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + fprintf(pp, " 1=%d", get_tensor_proto_data_size(words)); + fprintf(pp, " 2=%d", get_tensor_proto_data_size(positions)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(words, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(positions, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + } + else if (op == "Equal") + { + int op_type = 0; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Exp") + { + int op_type = 7; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Flatten") + { + int axis = get_node_attr_i(node, "axis", 1); + if (axis != 1) + { + fprintf(stderr, "Unsupported Flatten axis %d!\n", axis); + } + } + else if (op == "Floor") + { + int op_type = 2; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Gather") + { + if (weights[node.input(1)].dims_size() > 1) + { + fprintf(stderr, "Unsupported indice dims > 1"); + } + int axis = get_node_attr_i(node, "axis", 1) - 1; + if (axis < 0) + { + fprintf(stderr, "Unsupported Gather axis: %d\n", axis + 1); + } + fprintf(pp, " 0=%d", axis); + } + else if (op == "Gelu") + { + fprintf(pp, " 0=1"); + } + else if (op == "Gemm") + { + float alpha = 
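+ /* The bare int written ahead of each weight blob (quantize_tag here and throughout) is ncnn's weight tag field; 0 marks the tensor that follows as raw fp32, which is all this converter ever emits. */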
get_node_attr_f(node, "alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 1.f); + int transA = get_node_attr_i(node, "transA", 0); + int transB = get_node_attr_i(node, "transB", 0); + + if (alpha == 1.f && beta == 1.f && transA == 0 && transB == 1) + { + // InnerProduct-like A * B + C + const onnx::TensorProto& B = weights[node.input(1)]; + // B has transposed. + int num_output = B.dims(0); + fprintf(pp, " 0=%d", num_output); + if (node.input_size() == 3) + { + fprintf(pp, " 1=1"); + } + else + { + fprintf(pp, " 1=0"); + } + fprintf(pp, " 2=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(B, bp); + if (node.input_size() == 3) + { + const onnx::TensorProto& C = weights[node.input(2)]; + fwrite_tensor_proto_data(C, bp); + } + } + else + { + // gemm + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + fprintf(pp, " 2=%d", transA); + fprintf(pp, " 3=%d", transB); + } + } + else if (op == "GlobalAveragePool") + { + int pool = 1; + int global_pool = 1; + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 4=%d", global_pool); + } + else if (op == "GlobalMaxPool") + { + int pool = 0; + int global_pool = 1; + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 4=%d", global_pool); + } + else if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d" || + op == "adaptive_max_pool2d") + { + int pool = 0; + if (op == "AdaptiveAvgPool2d" || op == "adaptive_avg_pool2d") + { + pool = 1; + } + int adaptive_pooling = 1; + const onnx::TensorProto& out_shape_tp = weights[node.input(1)]; + std::vector out_shape = get_node_attr_from_input_ai(out_shape_tp); + + fprintf(pp, " 0=%d", pool); + fprintf(pp, " 7=%d", adaptive_pooling); + if (out_shape.size() == 1) + { + fprintf(pp, " 8=%d", out_shape[0]); + } + else if (out_shape.size() == 2) + { + // out_w + fprintf(pp, " 8=%d", out_shape[1]); + // out_h + fprintf(pp, " 18=%d", out_shape[0]); + } + } + else if (op == "GroupNorm") + { + int groups = get_node_attr_i(node, "groups", 1); + int channels = get_node_attr_i(node, "channels", 1); + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + int affine = get_node_attr_i(node, "affine", 1); + + if (affine) + { + // discard affine-less S=1 B=0 + std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + if (affine_S.size() == 1 && affine_S[0] == 1.f && affine_B.size() == 1 && + affine_B[0] == 0.f) + { + affine = 0; + } + else + { + affine = 0; + { + for (int j = 0; j < channels; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + } + } + + fprintf(pp, " 0=%d", groups); + fprintf(pp, " 1=%d", channels); + fprintf(pp, " 2=%e", eps); + fprintf(pp, " 3=%d", affine); + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "GRU") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + 
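+ /* Only the InnerProduct-shaped Gemm above (alpha=1, beta=1, transA=0, transB=1, i.e. y = x * B^T + c with B stored pre-transposed) maps onto ncnn's fully-connected layer; a PyTorch nn.Linear(in, out), for example, exports exactly this pattern with B of shape (out, in). Anything else falls through to the generic Gemm parameters. */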
direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + // reorder num_directions-URN-hidden-size to + // num_directions-RUN-hidden-size + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(W) / 3 / num_directions; + const float* wptr = + W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); + + const float* uptr = wptr; + const float* rptr = wptr + weight_data_size_g; + const float* nptr = wptr + weight_data_size_g * 2; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + uptr += weight_data_size_g * 3; + rptr += weight_data_size_g * 3; + nptr += weight_data_size_g * 3; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + } + } + + // reduce U and R bias except N + // reorder num_directions-URN-hidden to num_directions-RUN-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 3 / num_directions; + const float* bptr = + B.has_raw_data() ? (const float*)B.raw_data().data() : B.float_data().data(); + const float* wuptr = bptr; + const float* wrptr = bptr + bias_data_size_g; + const float* wnptr = bptr + bias_data_size_g * 2; + const float* buptr = bptr + bias_data_size_g * 3; + const float* brptr = bptr + bias_data_size_g * 4; + const float* bnptr = bptr + bias_data_size_g * 5; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wrptr[j] + brptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wuptr[j] + buptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + fwrite(wnptr, sizeof(float), bias_data_size_g, bp); + fwrite(bnptr, sizeof(float), bias_data_size_g, bp); + + if (direction_type == 2) + { + wuptr += bias_data_size_g * 6; + wrptr += bias_data_size_g * 6; + wnptr += bias_data_size_g * 6; + buptr += bias_data_size_g * 6; + brptr += bias_data_size_g * 6; + bnptr += bias_data_size_g * 6; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wrptr[j] + brptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = wuptr[j] + buptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + fwrite(wnptr, sizeof(float), bias_data_size_g, bp); + fwrite(bnptr, sizeof(float), bias_data_size_g, bp); + } + } + + // reorder num_directions-URN-hidden-hidden to + // num_directions-RUN-hidden-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(R) / 3 / num_directions; + const float* Rptr = + R.has_raw_data() ? 
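+ /* ONNX packs GRU gates in update-reset-hidden order (the U-R-N of the comments above) while ncnn consumes reset-update-hidden, so each direction's block is re-emitted with the first two gates swapped. The update and reset biases can be pre-summed (Wb + Rb) because those gates add both terms before their sigmoid, but the hidden gate's recurrent bias sits inside the reset multiplication, so Wbn and Rbn must stay separate. */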
(const float*)R.raw_data().data() : R.float_data().data(); + + const float* uptr = Rptr; + const float* rptr = Rptr + weight_data_size_g; + const float* nptr = Rptr + weight_data_size_g * 2; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + uptr += weight_data_size_g * 3; + rptr += weight_data_size_g * 3; + nptr += weight_data_size_g * 3; + fwrite(rptr, sizeof(float), weight_data_size_g, bp); + fwrite(uptr, sizeof(float), weight_data_size_g, bp); + fwrite(nptr, sizeof(float), weight_data_size_g, bp); + } + } + } + else if (op == "HardSigmoid") + { + float alpha = get_node_attr_f(node, "alpha", 0.2f); + float beta = get_node_attr_f(node, "beta", 0.5f); + + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + } + else if (op == "HardSwish") + { + float alpha = get_node_attr_f(node, "alpha", 0.2f); + float beta = get_node_attr_f(node, "beta", 0.5f); + + fprintf(pp, " 0=%e", alpha); + fprintf(pp, " 1=%e", beta); + } + else if (op == "ImageScaler") + { + std::vector bias = get_node_attr_af(node, "bias"); + float scale = get_node_attr_f(node, "scale", 1.f); + + int channels = (int)bias.size(); + + fprintf(pp, " 0=%d", channels); + fprintf(pp, " 1=1"); + + for (int j = 0; j < channels; j++) + { + fwrite(&scale, sizeof(float), 1, bp); + } + fwrite(&bias[0], sizeof(float), channels, bp); + } + else if (op == "InstanceNormalization") + { + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + + // discard affine-less S=1 B=0 + std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + int channels = (int)affine_S.size(); + int affine = 0; + { + for (int j = 0; j < channels; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + + fprintf(pp, " 0=%d", channels); + fprintf(pp, " 1=%e", eps); + fprintf(pp, " 2=%d", affine); + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "LayerNorm") + { + float eps = get_node_attr_f(node, "epsilon", 1e-5f); + int affine = get_node_attr_i(node, "affine", 1); + + if (affine) + { + // discard affine-less S=1 B=0 + std::vector affine_S = get_node_attr_from_input_af(weights[node.input(1)]); + std::vector affine_B = get_node_attr_from_input_af(weights[node.input(2)]); + int affine_size = (int)affine_S.size(); + affine = 0; + { + for (int j = 0; j < affine_size; j++) + { + if (affine_S[j] != 1.f || affine_B[j] != 0.f) + { + affine = 1; + break; + } + } + } + + if (affine) + { + fprintf(pp, " 0=%d", affine_size); + } + } + + fprintf(pp, " 1=%e", eps); + fprintf(pp, " 2=%d", affine); + + if (affine) + { + const onnx::TensorProto& scale = weights[node.input(1)]; + const onnx::TensorProto& B = weights[node.input(2)]; + + fwrite_tensor_proto_data(scale, bp); + fwrite_tensor_proto_data(B, bp); + } + } + else if (op == "LeakyRelu") + { + float alpha = get_node_attr_f(node, "alpha", 0.01f); + fprintf(pp, " 0=%e", alpha); + } + else if (op == "Threshold") + { + float threshold = get_node_attr_f(node, "threshold", 0.f); + fprintf(pp, " 0=%e", threshold); + } + else if (op == "Log") + { + int op_type = 8; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "LRN") + { + float alpha = get_node_attr_f(node, 
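+ /* InstanceNormalization and LayerNorm (like GroupNorm above) probe the exported scale/bias constants: when every element is exactly S=1, B=0 the affine step is dropped and no weights are written; otherwise affine=1 and both tensors follow in the .bin. */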
"alpha", 1.f); + float beta = get_node_attr_f(node, "beta", 0.5f); + float bias = get_node_attr_f(node, "bias", 1.f); + int size = get_node_attr_i(node, "size", 1); + + int norm_region = 0; + + fprintf(pp, " 0=%d", norm_region); + fprintf(pp, " 1=%d", size); + fprintf(pp, " 2=%e", alpha); + fprintf(pp, " 3=%e", beta); + fprintf(pp, " 4=%e", bias); + } + else if (op == "LSTM") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + // reorder num_directions-IOFG-hidden-size to + // num_directions-IFOG-hidden-size + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(W) / 4 / num_directions; + const float* wptr = + W.has_raw_data() ? (const float*)W.raw_data().data() : W.float_data().data(); + + const float* iptr = wptr; + const float* optr = wptr + weight_data_size_g; + const float* fptr = wptr + weight_data_size_g * 2; + const float* gptr = wptr + weight_data_size_g * 3; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + iptr += weight_data_size_g * 4; + optr += weight_data_size_g * 4; + fptr += weight_data_size_g * 4; + gptr += weight_data_size_g * 4; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + } + } + + // reduce xc and hc bias + // reorder num_directions-IOFG-hidden to num_directions-IFOG-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / 4 / num_directions; + const float* xcbptr = + B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); + const float* xiptr = xcbptr; + const float* xoptr = xcbptr + bias_data_size_g; + const float* xfptr = xcbptr + bias_data_size_g * 2; + const float* xgptr = xcbptr + bias_data_size_g * 3; + const float* hiptr = xcbptr + bias_data_size_g * 4; + const float* hoptr = xcbptr + bias_data_size_g * 5; + const float* hfptr = xcbptr + bias_data_size_g * 6; + const float* hgptr = xcbptr + bias_data_size_g * 7; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xfptr[j] + hfptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xoptr[j] + hoptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xgptr[j] + hgptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + + if (direction_type == 2) + { + xiptr += bias_data_size_g * 8; + xoptr += bias_data_size_g * 8; + xfptr += bias_data_size_g * 8; + xgptr += bias_data_size_g * 8; + hiptr += bias_data_size_g * 8; + hoptr += bias_data_size_g * 8; + hfptr += bias_data_size_g * 8; + hgptr += bias_data_size_g * 8; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xfptr[j] + hfptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xoptr[j] + hoptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xgptr[j] + hgptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + // reorder num_directions-IOFG-hidden-hidden to + // num_directions-IFOG-hidden-hidden + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int weight_data_size_g = get_tensor_proto_data_size(R) / 4 / num_directions; + const float* rptr = + R.has_raw_data() ? (const float*)R.raw_data().data() : R.float_data().data(); + + const float* iptr = rptr; + const float* optr = rptr + weight_data_size_g; + const float* fptr = rptr + weight_data_size_g * 2; + const float* gptr = rptr + weight_data_size_g * 3; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + + if (direction_type == 2) + { + iptr += weight_data_size_g * 4; + optr += weight_data_size_g * 4; + fptr += weight_data_size_g * 4; + gptr += weight_data_size_g * 4; + fwrite(iptr, sizeof(float), weight_data_size_g, bp); + fwrite(fptr, sizeof(float), weight_data_size_g, bp); + fwrite(optr, sizeof(float), weight_data_size_g, bp); + fwrite(gptr, sizeof(float), weight_data_size_g, bp); + } + } + } + else if (op == "MatMul") + { + if (weights.find(node.input(1)) != weights.end() && weights[node.input(1)].dims_size() == 2) + { + // InnerProduct + const onnx::TensorProto& B = weights[node.input(1)]; + + int weight_data_size = get_tensor_proto_data_size(B); + + int num_output = B.dims(B.dims_size() - 1); + int num_input = weight_data_size / num_output; + + fprintf(pp, " 0=%d", num_output); + fprintf(pp, " 1=0"); + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + // reorder num_input-num_output to num_output-num_input + { + const float* bptr = + B.has_raw_data() ? 
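+ /* ONNX packs LSTM gates in input-output-forget-cell (I-O-F-G) order while ncnn expects I-F-O-G, so the o and f blocks swap places in W, B and R above. Unlike the GRU case, all four LSTM gates add their input and recurrent biases directly, so each pair is pre-summed into one bias vector (vb = xiptr[j] + hiptr[j] and so on per gate). */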
(const float*)B.raw_data().data() : B.float_data().data(); + + for (int j = 0; j < num_output; j++) + { + for (int k = 0; k < num_input; k++) + { + float vb = bptr[k * num_output + j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + // fwrite_tensor_proto_data(B, bp) + } + else + { + // default matrix multiplication + } + } + else if (op == "Max") + { + int op_type = 4; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Min") + { + int op_type = 5; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Mul") + { + int op_type = 2; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "MultiHeadAttention") + { + int embed_dim = get_node_attr_i(node, "embed_dim", 0); + int num_heads = get_node_attr_i(node, "num_heads", 0); + + fprintf(pp, " 0=%d", embed_dim); + fprintf(pp, " 1=%d", num_heads); + + if (node.input_size() == 5) + { + const onnx::TensorProto& qkvw = weights[node.input(1)]; + const onnx::TensorProto& qkvb = weights[node.input(2)]; + const onnx::TensorProto& ow = weights[node.input(3)]; + const onnx::TensorProto& ob = weights[node.input(4)]; + + int weight_data_size = get_tensor_proto_data_size(ow); + + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose qw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose kw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + bptr += embed_dim; + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k + embed_dim]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose vw + { + const float* wptr = + qkvw.has_raw_data() ? (const float*)qkvw.raw_data().data() : qkvw.float_data().data(); + const float* bptr = + qkvb.has_raw_data() ? (const float*)qkvb.raw_data().data() : qkvb.float_data().data(); + bptr += embed_dim * 2; + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim * 3 + k + embed_dim * 2]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + + fwrite(bptr, sizeof(float), embed_dim, bp); + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose ow + { + const float* wptr = + ow.has_raw_data() ? 
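+ /* In the 5-input MultiHeadAttention form the q/k/v projections arrive fused as one (embed_dim x 3*embed_dim) matrix; the three copy loops above pull columns [0,E), [E,2E) and [2E,3E) out of every row (offsets k, k + embed_dim, k + 2*embed_dim) and emit them as three separate E x E weights, with the packed bias split at the same embed_dim strides. */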
(const float*)ow.raw_data().data() : ow.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(ob, bp); + } + else + { + const onnx::TensorProto& qw = weights[node.input(3)]; + const onnx::TensorProto& qb = weights[node.input(4)]; + const onnx::TensorProto& kw = weights[node.input(5)]; + const onnx::TensorProto& kb = weights[node.input(6)]; + const onnx::TensorProto& vw = weights[node.input(7)]; + const onnx::TensorProto& vb = weights[node.input(8)]; + const onnx::TensorProto& ow = weights[node.input(9)]; + const onnx::TensorProto& ob = weights[node.input(10)]; + + int weight_data_size = get_tensor_proto_data_size(qw); + + fprintf(pp, " 2=%d", weight_data_size); + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose qw + { + const float* wptr = + qw.has_raw_data() ? (const float*)qw.raw_data().data() : qw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(qb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose kw + { + const float* wptr = + kw.has_raw_data() ? (const float*)kw.raw_data().data() : kw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(kb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose vw + { + const float* wptr = + vw.has_raw_data() ? (const float*)vw.raw_data().data() : vw.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(vb, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + // transpose ow + { + const float* wptr = + ow.has_raw_data() ? 
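+ /* The alternative branch handles the unfused export, where inputs 3..10 carry qw/qb, kw/kb, vw/vb, ow/ob as eight separate tensors and each E x E weight is copied through element by element in the same row-major order. */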
(const float*)ow.raw_data().data() : ow.float_data().data(); + + for (int j = 0; j < embed_dim; j++) + { + for (int k = 0; k < embed_dim; k++) + { + float vb = wptr[j * embed_dim + k]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + fwrite_tensor_proto_data(ob, bp); + } + } + else if (op == "Neg") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "NonMaxSuppression") + { + int max_dets = 0; + float iou_thre = 0.f; + float score_thre = 0.f; + // fprintf(stderr, "%s\n", node.name().c_str()); + // fprintf(stderr, "node.input_size(): %d\n", node.input_size()); + if (node.input_size() >= 3) + { + // fprintf(stderr, "ok12!\n"); + max_dets = (int)(get_node_attr_from_input(weights[node.input(2)]) + 0.5); + } + if (node.input_size() >= 4) + { + // fprintf(stderr, "iou_thre: %f\n", + // get_node_attr_from_input(weights[node.input(3)])); + iou_thre = get_node_attr_from_input(weights[node.input(3)]); + } + if (node.input_size() >= 5) + { + // fprintf(stderr, "score_thre: %f\n", + // get_node_attr_from_input(weights[node.input(4)])); + score_thre = get_node_attr_from_input(weights[node.input(4)]); + } + fprintf(pp, " 0=%d", max_dets); + fprintf(pp, " 1=%f", iou_thre); + fprintf(pp, " 2=%f", score_thre); + } + else if (op == "Normalize") + { + float eps = get_node_attr_f(node, "eps", 0.f); + int scale_data_size = 1; + + fprintf(pp, " 1=1"); // channel_shared + fprintf(pp, " 2=%e", eps); + fprintf(pp, " 3=%d", scale_data_size); + fprintf(pp, " 9=1"); // TODO hardcode pytorch style + + const float scale_data[1] = {1.f}; + fwrite(scale_data, sizeof(float), 1, bp); + } + else if (op == "Pad") + { + std::string mode = get_node_attr_s(node, "mode"); + float value = get_node_attr_f(node, "value", 0.f); + + std::vector pads; + if (node.input_size() == 1) + { + pads = get_node_attr_ai(node, "pads"); + } + else + { + pads = get_node_attr_from_input_ai(weights[node.input(1)]); + } + int type = 0; + if (mode == "constant") + { + type = 0; + } + else if (mode == "edge") + { + type = 1; + } + else if (mode == "reflect") + { + type = 2; + } + + int pad_size = (int)pads.size(); + int top = 0; + int bottom = 0; + int left = 0; + int right = 0; + int front = 0; + int behind = 0; + if (pad_size == 8) + { + // NCHW + top = pads[2]; + bottom = pads[6]; + left = pads[3]; + right = pads[7]; + front = pads[1]; + behind = pads[5]; + } + else if (pad_size == 6) + { + // NHW + top = pads[1]; + bottom = pads[4]; + left = pads[2]; + right = pads[5]; + } + else + { + // NW + left = pads[1]; + right = pads[3]; + } + + fprintf(pp, " 0=%d", top); + fprintf(pp, " 1=%d", bottom); + fprintf(pp, " 2=%d", left); + fprintf(pp, " 3=%d", right); + fprintf(pp, " 4=%d", type); + fprintf(pp, " 5=%e", value); + fprintf(pp, " 7=%d", front); + fprintf(pp, " 8=%d", behind); + } + else if (op == "Pow") + { + int op_type = 6; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "PriorBox") + { + std::vector min_sizes = get_node_attr_af(node, "min_sizes"); + std::vector max_sizes = get_node_attr_af(node, "max_sizes"); + std::vector aspect_ratios = get_node_attr_af(node, "aspect_ratios"); + fprintf(pp, " -23300=%zu", min_sizes.size()); + for (size_t j = 0; j < min_sizes.size(); ++j) + { + fprintf(pp, ",%f", min_sizes[j]); + } + fprintf(pp, " -23301=%zu", max_sizes.size()); + for (size_t j = 0; j < max_sizes.size(); ++j) + { + 
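+ /* ONNX pads list every dimension's begin values before its end values, batch included, so for 8 entries (NCHW) top/bottom sit at indices 2/6, left/right at 3/7, and the channel pads ("front"/"behind") at 1/5; the 6- and 4-entry cases above are the same scheme read as NHW and NW. */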
fprintf(pp, ",%f", max_sizes[j]); + } + fprintf(pp, " -23302=%zu", aspect_ratios.size()); + for (size_t j = 0; j < aspect_ratios.size(); ++j) + { + fprintf(pp, ",%f", aspect_ratios[j]); + } + int image_width = get_node_attr_i(node, "image_width"); + int image_height = get_node_attr_i(node, "image_height"); + float step_width = get_node_attr_f(node, "step_width"); + float step_height = get_node_attr_f(node, "step_height"); + float offset = get_node_attr_f(node, "offset"); + int step_mmdetection = get_node_attr_i(node, "step_mmdetection"); + fprintf(pp, " 9=%d", image_width); + fprintf(pp, " 10=%d", image_height); + fprintf(pp, " 11=%f", step_width); + fprintf(pp, " 12=%f", step_height); + fprintf(pp, " 13=%f", offset); + fprintf(pp, " 14=%d", step_mmdetection); + } + else if (op == "PixelShuffle") + { + int scale_factor = get_node_attr_i(node, "scale_factor", 1); + fprintf(pp, " 0=%d", scale_factor); + } + else if (op == "PRelu") + { + const onnx::TensorProto& slope = weights[node.input(1)]; + + int num_slope = get_tensor_proto_data_size(slope); + + fprintf(pp, " 0=%d", num_slope); + + fwrite_tensor_proto_data(slope, bp); + } + else if (op == "Reciprocal") + { + int op_type = 15; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || + op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || + op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") + { + int op_type = -233; + if (op == "ReduceSum") + op_type = 0; + else if (op == "ReduceSumSquare") + op_type = 2; + else if (op == "ReduceMean") + op_type = 3; + else if (op == "ReduceMax") + op_type = 4; + else if (op == "ReduceMin") + op_type = 5; + else if (op == "ReduceProd") + op_type = 6; + else if (op == "ReduceL1") + op_type = 7; + else if (op == "ReduceL2") + op_type = 8; + else if (op == "ReduceLogSum") + op_type = 9; + else if (op == "ReduceLogSumExp") + op_type = 10; + fprintf(pp, " 0=%d", op_type); + + std::vector axes = get_node_attr_ai(node, "axes"); + int keepdims = get_node_attr_i(node, "keepdims", 1); + + if (axes.size() > 0) + { + // if axes set, reduce according to axes + fprintf(pp, " 1=%d", 0); + fprintf(pp, " -23303=%zu", axes.size()); + for (size_t j = 0; j < axes.size(); j++) + { + if (axes[j] == 0 || axes[j] > 4 || axes[j] < -3) + fprintf(stderr, "Unsupported reduction axes !\n"); + fprintf(pp, ",%d", axes[j] > 0 ? axes[j] - 1 : axes[j]); + } + } + else + { + // if axes not set, reduce all axes by default + fprintf(pp, " 1=%d", 1); + } + fprintf(pp, " 4=%d", keepdims); + fprintf(pp, " 5=1"); + } + else if (op == "Reorg") + { + int stride = get_node_attr_i(node, "stride", 1); + fprintf(pp, " 0=%d", stride); + } + else if (op == "Reshape") + { + std::vector shape; + + if (node.input_size() == 1) + { + shape = get_node_attr_ai(node, "shape"); + } + else if (weights.find(node.input(1)) != weights.end()) + { + shape = get_node_attr_from_input_ai(weights[node.input(1)]); + } + else + { + fprintf(stderr, "Unsupported reshape weight ! 
\n"); + } + + if (shape.size() == 1) + { + fprintf(pp, " 0=%d", shape[0]); // should never reach here + } + else if (shape.size() == 2) + { + fprintf(pp, " 0=%d", shape[1]); + } + else if (shape.size() == 3) + { + fprintf(pp, " 0=%d", shape[2]); + fprintf(pp, " 1=%d", shape[1]); + } + else if (shape.size() == 4) + { + fprintf(pp, " 0=%d", shape[3]); + fprintf(pp, " 1=%d", shape[2]); + fprintf(pp, " 2=%d", shape[1]); + } + else if (shape.size() == 5) + { + fprintf(pp, " 0=%d", shape[4] * shape[3]); + fprintf(pp, " 1=%d", shape[2]); + fprintf(pp, " 2=%d", shape[1]); + } + } + else if (op == "Resize") + { + std::string mode = get_node_attr_s(node, "mode"); + std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); + + std::vector scales; + std::vector sizes; + if (node.input_size() == 2) + { + // opset 10 + scales = get_node_attr_from_input_af(weights[node.input(1)]); + } + else + { + // opset 11+ + scales = get_node_attr_from_input_af(weights[node.input(2)]); + if (node.input_size() >= 4) + { + sizes = get_node_attr_from_input_ai(weights[node.input(3)]); + } + } + + int resize_type = 1; + if (mode == "nearest") + { + resize_type = 1; + } + else if (mode == "linear") + { + resize_type = 2; + } + else if (mode == "cubic") + { + resize_type = 3; + } + + if (scales.empty() && sizes.empty()) + { + fprintf(stderr, "Unsupported Resize scales and sizes are all empty!\n"); + } + + float h_scale = 1.f; + float w_scale = 1.f; + if (scales.size() == 2) + { + w_scale = scales[1]; + } + else if (scales.size() == 3) + { + h_scale = scales[1]; + w_scale = scales[2]; + } + else if (scales.size() == 4) + { + h_scale = scales[2]; + w_scale = scales[3]; + + if (scales[1] != 1.f) fprintf(stderr, "Unsupported Resize scales !\n"); + } + + int output_height = 0; + int output_width = 0; + if (sizes.size() == 2) + { + output_width = sizes[1]; + } + else if (sizes.size() == 3) + { + output_height = sizes[1]; + output_width = sizes[2]; + } + else if (sizes.size() == 4) + { + output_height = sizes[2]; + output_width = sizes[3]; + } + + int align_corner = 0; + if (align == "align_corners") + { + align_corner = 1; + } + + fprintf(pp, " 0=%d", resize_type); + fprintf(pp, " 1=%e", h_scale); + fprintf(pp, " 2=%e", w_scale); + fprintf(pp, " 3=%d", output_height); + fprintf(pp, " 4=%d", output_width); + fprintf(pp, " 6=%d", align_corner); + } + else if (op == "RNN") + { + const onnx::TensorProto& W = weights[node.input(1)]; + const onnx::TensorProto& R = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + + int hidden_size = get_node_attr_i(node, "hidden_size", 0); + std::string direction = get_node_attr_s(node, "direction"); + + int direction_type = 0; + if (direction == "forward") + { + direction_type = 0; + } + else if (direction == "reverse") + { + direction_type = 1; + } + else if (direction == "bidirectional") + { + direction_type = 2; + } + + int weight_data_size = get_tensor_proto_data_size(W); + + fprintf(pp, " 0=%d", hidden_size); + fprintf(pp, " 1=%d", weight_data_size); + fprintf(pp, " 2=%d", direction_type); + + int num_directions = direction_type == 2 ? 2 : 1; + + int quantize_tag = 0; + + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(W, bp); + + // reduce xc and hc bias + { + fwrite(&quantize_tag, sizeof(int), 1, bp); + + int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / num_directions; + const float* bptr = + B.has_raw_data() ? 
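+ /* Resize moved its inputs between opsets: opset 10 passes scales as input(1), while opset 11+ inserts roi at input(1), shifting scales to input(2) with optional sizes at input(3) -- exactly the two shapes the input_size() branch above distinguishes. */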
(const float*)B.raw_data().data() : B.float_data().data(); + const float* xiptr = bptr; + const float* hiptr = bptr + bias_data_size_g; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + + if (direction_type == 2) + { + xiptr += bias_data_size_g * 2; + hiptr += bias_data_size_g * 2; + + for (int j = 0; j < bias_data_size_g; j++) + { + float vb = xiptr[j] + hiptr[j]; + fwrite(&vb, sizeof(float), 1, bp); + } + } + } + + fwrite(&quantize_tag, sizeof(int), 1, bp); + fwrite_tensor_proto_data(R, bp); + } + else if (op == "RDiv") + { + int op_type = 8; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "RSub") + { + int op_type = 7; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "RoiAlign") + { + int pooled_width = get_node_attr_i(node, "output_width", 1); + int pooled_height = get_node_attr_i(node, "output_height", 1); + float spatial_scale = get_node_attr_f(node, "spatial_scale", 1.f); + int sampling_ratio = get_node_attr_i(node, "sampling_ratio", 0); + fprintf(pp, " 0=%d", pooled_width); + fprintf(pp, " 1=%d", pooled_height); + fprintf(pp, " 2=%f", spatial_scale); + fprintf(pp, " 3=%d", sampling_ratio); + } + else if (op == "ShuffleChannel") + { + int group = get_node_attr_i(node, "group", 1); + int reverse = get_node_attr_i(node, "reverse", 0); + fprintf(pp, " 0=%d", group); + fprintf(pp, " 1=%d", reverse); + } + else if (op == "Sigmoid") + { + // no param + } + else if (op == "Sin") + { + int op_type = 9; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "SkipLayerNormalization") + { + const onnx::TensorProto& W = weights[node.input(2)]; + const onnx::TensorProto& B = weights[node.input(3)]; + const onnx::TensorProto& B2 = weights[node.input(4)]; + + fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); + + int quantize_tag = 0; + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(W, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B, bp); + + fwrite(&quantize_tag, sizeof(int), 1, bp); + + fwrite_tensor_proto_data(B2, bp); + } + else if (op == "Slice") + { + bool use_crop = true; + + std::vector starts; + std::vector ends; + std::vector axes; + std::vector steps; + if (node.input_size() == 1) + { + starts = get_node_attr_ai(node, "starts"); + ends = get_node_attr_ai(node, "ends"); + axes = get_node_attr_ai(node, "axes"); + steps = get_node_attr_ai(node, "steps"); // TODO + } + else + { + starts = get_node_attr_from_input_ai(weights[node.input(1)]); + ends = get_node_attr_from_input_ai(weights[node.input(2)]); + if (node.input_size() >= 4) axes = get_node_attr_from_input_ai(weights[node.input(3)]); + if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); + } + + // assert step == 1 or step >= ends + for (int i = 0; i < (int)steps.size(); i++) + { + if (steps[i] != 1 && steps[i] < ends[i]) + { + use_crop = false; + fprintf(stderr, "Unsupported slice step ! 
Use custom TensorSlice\n"); + } + } + + if (use_crop) + { + // filter out N-dim axis + if (!axes.empty()) + { + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis == 0) + { + starts.erase(starts.begin() + i); + ends.erase(ends.begin() + i); + axes.erase(axes.begin() + i); + break; + } + } + } + + fprintf(pp, " -23309=%d", (int)starts.size()); + for (int i = 0; i < (int)starts.size(); i++) + { + fprintf(pp, ",%d", starts[i]); + } + fprintf(pp, " -23310=%d", (int)ends.size()); + for (int i = 0; i < (int)ends.size(); i++) + { + fprintf(pp, ",%d", ends[i]); + } + if (!axes.empty()) + { + fprintf(pp, " -23311=%d", (int)axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis == 0 || axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); + + if (axis > 0) axis = axis - 1; // -1 for skip N-dim + + fprintf(pp, ",%d", axis); + } + } + } + else + { + fprintf(pp, " -23300=%d", (int)starts.size()); + for (int i = 0; i < (int)starts.size(); i++) + { + fprintf(pp, ",%d", starts[i]); + } + fprintf(pp, " -23301=%d", (int)ends.size()); + for (int i = 0; i < (int)ends.size(); i++) + { + fprintf(pp, ",%d", ends[i]); + } + if (!axes.empty()) + { + fprintf(pp, " -23302=%d", (int)axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + int axis = axes[i]; + if (axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); + fprintf(pp, ",%d", axis); + } + } + if (!steps.empty()) + { + fprintf(pp, " -23303=%d", (int)steps.size()); + for (int i = 0; i < (int)steps.size(); i++) + { + int step = steps[i]; + if (step == 0) fprintf(stderr, "Unsupported slice step ! Unsupported slice step\n"); + fprintf(pp, ",%d", step); + } + } + } + } + else if (op == "Softmax") + { + int axis = get_node_attr_i(node, "axis", 1); + fprintf(pp, " 0=%d", axis - 1); + fprintf(pp, " 1=1"); + } + else if (op == "Split") + { + int axis = get_node_attr_i(node, "axis", 0); + std::vector split = get_node_attr_ai(node, "split"); + if (axis < 1) fprintf(stderr, "Unsupported split axis !\n"); + + fprintf(pp, " -23300=%d", output_size); + if (split.empty()) + { + for (int i = 0; i < output_size; i++) + { + fprintf(pp, ",-233"); + } + } + else + { + for (size_t i = 0; i < split.size() - 1; i++) + { + fprintf(pp, ",%d", split[i]); + } + fprintf(pp, ",-233"); + } + fprintf(pp, " 1=%d", axis - 1); + } + else if (op == "Sqrt") + { + int op_type = 5; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Squeeze") + { + std::vector axes = get_node_attr_ai(node, "axes"); + + if (axes.empty()) + { + fprintf(pp, " 0=1"); + fprintf(pp, " 1=1"); + fprintf(pp, " 2=1"); + } + else + { + bool flag = true; + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0) + { + flag = false; + break; + } + } + if (flag == true) + { + fprintf(pp, " -23303=%zu", axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0 || axes[i] > 3 || axes[i] < -3) + fprintf(stderr, "Unsupported squeeze axes !: %d, %s\n", axes[i], node.name().c_str()); + fprintf(pp, ",%d", axes[i] - 1); + } + } + } + } + else if (op == "Sub") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + + int with_scalar = get_node_attr_i(node, "with_scalar", 0); + float b = get_node_attr_f(node, "b", 0.f); + if (with_scalar) + { + fprintf(pp, " 1=%d", with_scalar); + fprintf(pp, " 2=%e", b); + } + } + else if (op == "Sum") + { + int op_type = 1; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Swish") + { + // no param + } + else if (op == "Tan") + { + int op_type = 
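+ /* Slice lowers to ncnn's Crop (-23309/-23310/-23311) when every step is 1 and falls back to the TensorSlice-style -23300..-23303 parameters otherwise. More generally, ncnn blobs carry no batch dimension, so the axis-bearing ops here (Concat, Softmax, Gather, Split, Slice, Squeeze, the Reduce family) shift positive ONNX axes down by one and warn on axis 0; Split's trailing ",-233" appears to be the usual ncnn placeholder asking the layer to infer the final slice's size from whatever remains. */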
11; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "Tanh") + { + int op_type = 16; + fprintf(pp, " 0=%d", op_type); + } + else if (op == "TopK") + { + int axis = get_node_attr_i(node, "axis", -1); + axis = axis > 0 ? axis - 1 : axis; + int largest = get_node_attr_i(node, "largest", 1); + int sorted = get_node_attr_i(node, "sorted", 1); + fprintf(pp, " 0=%d", axis); + fprintf(pp, " 1=%d", largest); + fprintf(pp, " 2=%d", sorted); + } + else if (op == "Transpose") + { + std::vector perm = get_node_attr_ai(node, "perm"); + + if (perm.size() == 3) + { + if (perm[1] == 1 && perm[2] == 2) + fprintf(pp, " 0=0"); // w h + else if (perm[1] == 2 && perm[2] == 1) + fprintf(pp, " 0=1"); // h w + else if (perm[0] == 1 && perm[1] == 0 && perm[2] == 2) + fprintf(pp, " 0=0"); // w h + else if (perm[0] == 2 && perm[1] == 0 && perm[2] == 1) + fprintf(pp, " 0=1"); // h w + } + else if (perm.size() == 4) + { + if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3) + fprintf(pp, " 0=0"); // w h c + else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 2) + fprintf(pp, " 0=1"); // h w c + else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3) + fprintf(pp, " 0=2"); // w c h + else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 1) + fprintf(pp, " 0=3"); // c w h + else if (perm[1] == 3 && perm[2] == 1 && perm[3] == 2) + fprintf(pp, " 0=4"); // h c w + else if (perm[1] == 3 && perm[2] == 2 && perm[3] == 1) + fprintf(pp, " 0=5"); // c h w + } + else if (perm.size() == 5) + { + if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3 && perm[4] == 4) + fprintf(pp, " 0=0"); // wx h c + else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 4 && perm[4] == 2) + fprintf(pp, " 0=1"); // h wx c + else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3 && perm[4] == 4) + fprintf(pp, " 0=2"); // wx c h + else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 4 && perm[4] == 1) + fprintf(pp, " 0=3"); // c wx h + else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 1 && perm[4] == 2) + fprintf(pp, " 0=4"); // h c wx + else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 2 && perm[4] == 1) + fprintf(pp, " 0=5"); // c h wx + else + fprintf(stderr, "Unsupported transpose type !\n"); + } + } + else if (op == "Upsample") + { + std::string mode = get_node_attr_s(node, "mode"); + std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); + + std::vector scales; + + if (node.input_size() == 1) + { + scales = get_node_attr_af(node, "scales"); + } + else + { + scales = get_node_attr_from_input_af(weights[node.input(1)]); + } + + int resize_type = 1; + if (mode == "nearest") + { + resize_type = 1; + } + else if (mode == "bilinear" || mode == "linear") + { + resize_type = 2; + } + else if (mode == "trilinear") + { + fprintf(stderr, "Unsupported Upsample mode !\n"); + } + + float h_scale = 1.f; + float w_scale = 1.f; + if (scales.size() == 2) + { + w_scale = scales[1]; + } + else if (scales.size() == 3) + { + h_scale = scales[1]; + w_scale = scales[2]; + } + else if (scales.size() == 4) + { + h_scale = scales[2]; + w_scale = scales[3]; + + if (scales[1] != 1.f) fprintf(stderr, "Unsupported Upsample scales !\n"); + } + else + { + fprintf(stderr, "Unsupported Upsample scales !\n"); + } + + int align_corner = 0; + if (align == "align_corners") + { + align_corner = 1; + } + + fprintf(pp, " 0=%d", resize_type); + fprintf(pp, " 1=%e", h_scale); + fprintf(pp, " 2=%e", w_scale); + fprintf(pp, " 6=%d", align_corner); + } + else if (op == "Unsqueeze") + { + std::vector axes = get_node_attr_ai(node, "axes"); + bool flag = true; + for (int i = 0; i 
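+ /* With the batch dim implicit, a 4-entry ONNX perm is matched on its last three positions and mapped to ncnn Permute order_type 0-5, covering all six orderings of w/h/c; the 5-entry cases fold the two innermost axes into a single "wx" axis first, as the inline comments note. */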
< (int)axes.size(); i++) + { + if (axes[i] == 0) + { + flag = false; + break; + } + } + if (flag) + { + fprintf(pp, " -23303=%zu", axes.size()); + for (int i = 0; i < (int)axes.size(); i++) + { + if (axes[i] == 0 || axes[i] > 4 || axes[i] < -4) + fprintf(stderr, "Unsupported unsqueeze axes !: %d, %s\n", axes[i], node.name().c_str()); + fprintf(pp, ",%d", axes[i] - 1); + } + } + } + else if (op == "Yolov3DetectionOutput") + { + int num_class = get_node_attr_i(node, "num_class"); + int num_box = get_node_attr_i(node, "num_box"); + float confidence_threshold = get_node_attr_f(node, "confidence_threshold"); + float nms_threshold = get_node_attr_f(node, "nms_threshold"); + fprintf(pp, " 0=%d", num_class); + fprintf(pp, " 1=%d", num_box); + fprintf(pp, " 2=%e", confidence_threshold); + fprintf(pp, " 3=%e", nms_threshold); + std::vector biases = get_node_attr_af(node, "biases"); + if (biases.size() > 0) + { + fprintf(pp, " -23304=%zu", biases.size()); + for (int i = 0; i < (int)biases.size(); i++) + { + fprintf(pp, ",%e", biases[i]); + } + } + std::vector mask = get_node_attr_af(node, "mask"); + if (mask.size() > 0) + { + fprintf(pp, " -23305=%zu", mask.size()); + for (int i = 0; i < (int)mask.size(); i++) + { + fprintf(pp, ",%e", mask[i]); + } + } + std::vector anchors_scale = get_node_attr_af(node, "anchors_scale"); + if (anchors_scale.size() > 0) + { + fprintf(pp, " -23306=%zu", anchors_scale.size()); + for (int i = 0; i < (int)anchors_scale.size(); i++) + { + fprintf(pp, ",%e", anchors_scale[i]); + } + } + } + else + { + // TODO op specific param + } + + fprintf(pp, "\n"); + for (int j = 0; j < output_size; j++) + { + const std::string& output_name = node.output(j); + if (node_reference.find(output_name) != node_reference.end()) + { + int refcount = node_reference[output_name]; + if (refcount > 1) + { + char splitname[256]; + sprintf(splitname, "splitncnn_%d", internal_split); + fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); + + fprintf(pp, " %s", output_name.c_str()); + + for (int k = 0; k < refcount; k++) + { + fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); + } + fprintf(pp, "\n"); + + internal_split++; + } } - } - } - fwrite_tensor_proto_data(qb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose kw - { - const float* wptr = - kw.has_raw_data() ? (const float*)kw.raw_data().data() : kw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(kb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose vw - { - const float* wptr = - vw.has_raw_data() ? (const float*)vw.raw_data().data() : vw.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(vb, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - // transpose ow - { - const float* wptr = - ow.has_raw_data() ? 
(const float*)ow.raw_data().data() : ow.float_data().data(); - - for (int j = 0; j < embed_dim; j++) { - for (int k = 0; k < embed_dim; k++) { - float vb = wptr[j * embed_dim + k]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - fwrite_tensor_proto_data(ob, bp); - } - } else if (op == "Neg") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - } else if (op == "NonMaxSuppression") { - int max_dets = 0; - float iou_thre = 0.f; - float score_thre = 0.f; - // fprintf(stderr, "%s\n", node.name().c_str()); - // fprintf(stderr, "node.input_size(): %d\n", node.input_size()); - if (node.input_size() >= 3) { - // fprintf(stderr, "ok12!\n"); - max_dets = (int)(get_node_attr_from_input(weights[node.input(2)]) + 0.5); - } - if (node.input_size() >= 4) { - // fprintf(stderr, "iou_thre: %f\n", - // get_node_attr_from_input(weights[node.input(3)])); - iou_thre = get_node_attr_from_input(weights[node.input(3)]); - } - if (node.input_size() >= 5) { - // fprintf(stderr, "score_thre: %f\n", - // get_node_attr_from_input(weights[node.input(4)])); - score_thre = get_node_attr_from_input(weights[node.input(4)]); - } - fprintf(pp, " 0=%d", max_dets); - fprintf(pp, " 1=%f", iou_thre); - fprintf(pp, " 2=%f", score_thre); - } else if (op == "Normalize") { - float eps = get_node_attr_f(node, "eps", 0.f); - int scale_data_size = 1; - - fprintf(pp, " 1=1"); // channel_shared - fprintf(pp, " 2=%e", eps); - fprintf(pp, " 3=%d", scale_data_size); - fprintf(pp, " 9=1"); // TODO hardcode pytorch style - - const float scale_data[1] = {1.f}; - fwrite(scale_data, sizeof(float), 1, bp); - } else if (op == "Pad") { - std::string mode = get_node_attr_s(node, "mode"); - float value = get_node_attr_f(node, "value", 0.f); - - std::vector pads; - if (node.input_size() == 1) { - pads = get_node_attr_ai(node, "pads"); - } else { - pads = get_node_attr_from_input_ai(weights[node.input(1)]); - } - int type = 0; - if (mode == "constant") { - type = 0; - } else if (mode == "edge") { - type = 1; - } else if (mode == "reflect") { - type = 2; - } - - int pad_size = (int)pads.size(); - int top = 0; - int bottom = 0; - int left = 0; - int right = 0; - int front = 0; - int behind = 0; - if (pad_size == 8) { - // NCHW - top = pads[2]; - bottom = pads[6]; - left = pads[3]; - right = pads[7]; - front = pads[1]; - behind = pads[5]; - } else if (pad_size == 6) { - // NHW - top = pads[1]; - bottom = pads[4]; - left = pads[2]; - right = pads[5]; - } else { - // NW - left = pads[1]; - right = pads[3]; - } - - fprintf(pp, " 0=%d", top); - fprintf(pp, " 1=%d", bottom); - fprintf(pp, " 2=%d", left); - fprintf(pp, " 3=%d", right); - fprintf(pp, " 4=%d", type); - fprintf(pp, " 5=%e", value); - fprintf(pp, " 7=%d", front); - fprintf(pp, " 8=%d", behind); - } else if (op == "Pow") { - int op_type = 6; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "PriorBox") { - std::vector min_sizes = get_node_attr_af(node, "min_sizes"); - std::vector max_sizes = get_node_attr_af(node, "max_sizes"); - std::vector aspect_ratios = get_node_attr_af(node, "aspect_ratios"); - fprintf(pp, " -23300=%zu", min_sizes.size()); - for (size_t j = 0; j < min_sizes.size(); ++j) { - fprintf(pp, ",%f", min_sizes[j]); - } - fprintf(pp, " -23301=%zu", max_sizes.size()); - for (size_t j = 0; j < max_sizes.size(); ++j) { - fprintf(pp, ",%f", max_sizes[j]); - } - fprintf(pp, " -23302=%zu", 
aspect_ratios.size()); - for (size_t j = 0; j < aspect_ratios.size(); ++j) { - fprintf(pp, ",%f", aspect_ratios[j]); - } - int image_width = get_node_attr_i(node, "image_width"); - int image_height = get_node_attr_i(node, "image_height"); - float step_width = get_node_attr_f(node, "step_width"); - float step_height = get_node_attr_f(node, "step_height"); - float offset = get_node_attr_f(node, "offset"); - int step_mmdetection = get_node_attr_i(node, "step_mmdetection"); - fprintf(pp, " 9=%d", image_width); - fprintf(pp, " 10=%d", image_height); - fprintf(pp, " 11=%f", step_width); - fprintf(pp, " 12=%f", step_height); - fprintf(pp, " 13=%f", offset); - fprintf(pp, " 14=%d", step_mmdetection); - } else if (op == "PixelShuffle") { - int scale_factor = get_node_attr_i(node, "scale_factor", 1); - fprintf(pp, " 0=%d", scale_factor); - } else if (op == "PRelu") { - const onnx::TensorProto& slope = weights[node.input(1)]; - - int num_slope = get_tensor_proto_data_size(slope); - - fprintf(pp, " 0=%d", num_slope); - - fwrite_tensor_proto_data(slope, bp); - } else if (op == "Reciprocal") { - int op_type = 15; - fprintf(pp, " 0=%d", op_type); - } else if (op == "ReduceMax" || op == "ReduceMin" || op == "ReduceMean" || op == "ReduceProd" || - op == "ReduceSum" || op == "ReduceSumSquare" || op == "ReduceL1" || - op == "ReduceL2" || op == "ReduceLogSum" || op == "ReduceLogSumExp") { - int op_type = -233; - if (op == "ReduceSum") - op_type = 0; - else if (op == "ReduceSumSquare") - op_type = 2; - else if (op == "ReduceMean") - op_type = 3; - else if (op == "ReduceMax") - op_type = 4; - else if (op == "ReduceMin") - op_type = 5; - else if (op == "ReduceProd") - op_type = 6; - else if (op == "ReduceL1") - op_type = 7; - else if (op == "ReduceL2") - op_type = 8; - else if (op == "ReduceLogSum") - op_type = 9; - else if (op == "ReduceLogSumExp") - op_type = 10; - fprintf(pp, " 0=%d", op_type); - - std::vector axes = get_node_attr_ai(node, "axes"); - int keepdims = get_node_attr_i(node, "keepdims", 1); - - if (axes.size() > 0) { - // if axes set, reduce according to axes - fprintf(pp, " 1=%d", 0); - fprintf(pp, " -23303=%zu", axes.size()); - for (size_t j = 0; j < axes.size(); j++) { - if (axes[j] == 0 || axes[j] > 4 || axes[j] < -3) - fprintf(stderr, "Unsupported reduction axes !\n"); - fprintf(pp, ",%d", axes[j] > 0 ? axes[j] - 1 : axes[j]); - } - } else { - // if axes not set, reduce all axes by default - fprintf(pp, " 1=%d", 1); - } - fprintf(pp, " 4=%d", keepdims); - fprintf(pp, " 5=1"); - } else if (op == "Reorg") { - int stride = get_node_attr_i(node, "stride", 1); - fprintf(pp, " 0=%d", stride); - } else if (op == "Reshape") { - std::vector shape; - - if (node.input_size() == 1) { - shape = get_node_attr_ai(node, "shape"); - } else if (weights.find(node.input(1)) != weights.end()) { - shape = get_node_attr_from_input_ai(weights[node.input(1)]); - } else { - fprintf(stderr, "Unsupported reshape weight ! 
\n"); - } - - if (shape.size() == 1) { - fprintf(pp, " 0=%d", shape[0]); // should never reach here - } else if (shape.size() == 2) { - fprintf(pp, " 0=%d", shape[1]); - } else if (shape.size() == 3) { - fprintf(pp, " 0=%d", shape[2]); - fprintf(pp, " 1=%d", shape[1]); - } else if (shape.size() == 4) { - fprintf(pp, " 0=%d", shape[3]); - fprintf(pp, " 1=%d", shape[2]); - fprintf(pp, " 2=%d", shape[1]); - } else if (shape.size() == 5) { - fprintf(pp, " 0=%d", shape[4] * shape[3]); - fprintf(pp, " 1=%d", shape[2]); - fprintf(pp, " 2=%d", shape[1]); - } - } else if (op == "Resize") { - std::string mode = get_node_attr_s(node, "mode"); - std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); - - std::vector scales; - std::vector sizes; - if (node.input_size() == 2) { - // opset 10 - scales = get_node_attr_from_input_af(weights[node.input(1)]); - } else { - // opset 11+ - scales = get_node_attr_from_input_af(weights[node.input(2)]); - if (node.input_size() >= 4) { - sizes = get_node_attr_from_input_ai(weights[node.input(3)]); - } - } - - int resize_type = 1; - if (mode == "nearest") { - resize_type = 1; - } else if (mode == "linear") { - resize_type = 2; - } else if (mode == "cubic") { - resize_type = 3; - } - - if (scales.empty() && sizes.empty()) { - fprintf(stderr, "Unsupported Resize scales and sizes are all empty!\n"); - } - - float h_scale = 1.f; - float w_scale = 1.f; - if (scales.size() == 2) { - w_scale = scales[1]; - } else if (scales.size() == 3) { - h_scale = scales[1]; - w_scale = scales[2]; - } else if (scales.size() == 4) { - h_scale = scales[2]; - w_scale = scales[3]; - - if (scales[1] != 1.f) fprintf(stderr, "Unsupported Resize scales !\n"); - } - - int output_height = 0; - int output_width = 0; - if (sizes.size() == 2) { - output_width = sizes[1]; - } else if (sizes.size() == 3) { - output_height = sizes[1]; - output_width = sizes[2]; - } else if (sizes.size() == 4) { - output_height = sizes[2]; - output_width = sizes[3]; - } - - int align_corner = 0; - if (align == "align_corners") { - align_corner = 1; - } - - fprintf(pp, " 0=%d", resize_type); - fprintf(pp, " 1=%e", h_scale); - fprintf(pp, " 2=%e", w_scale); - fprintf(pp, " 3=%d", output_height); - fprintf(pp, " 4=%d", output_width); - fprintf(pp, " 6=%d", align_corner); - } else if (op == "RNN") { - const onnx::TensorProto& W = weights[node.input(1)]; - const onnx::TensorProto& R = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - - int hidden_size = get_node_attr_i(node, "hidden_size", 0); - std::string direction = get_node_attr_s(node, "direction"); - - int direction_type = 0; - if (direction == "forward") { - direction_type = 0; - } else if (direction == "reverse") { - direction_type = 1; - } else if (direction == "bidirectional") { - direction_type = 2; - } - - int weight_data_size = get_tensor_proto_data_size(W); - - fprintf(pp, " 0=%d", hidden_size); - fprintf(pp, " 1=%d", weight_data_size); - fprintf(pp, " 2=%d", direction_type); - - int num_directions = direction_type == 2 ? 2 : 1; - - int quantize_tag = 0; - - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(W, bp); - - // reduce xc and hc bias - { - fwrite(&quantize_tag, sizeof(int), 1, bp); - - int bias_data_size_g = get_tensor_proto_data_size(B) / 2 / num_directions; - const float* bptr = - B.has_raw_data() ? 
(const float*)B.raw_data().data() : B.float_data().data(); - const float* xiptr = bptr; - const float* hiptr = bptr + bias_data_size_g; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - - if (direction_type == 2) { - xiptr += bias_data_size_g * 2; - hiptr += bias_data_size_g * 2; - - for (int j = 0; j < bias_data_size_g; j++) { - float vb = xiptr[j] + hiptr[j]; - fwrite(&vb, sizeof(float), 1, bp); - } - } - } - - fwrite(&quantize_tag, sizeof(int), 1, bp); - fwrite_tensor_proto_data(R, bp); - } else if (op == "RDiv") { - int op_type = 8; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "RSub") { - int op_type = 7; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "RoiAlign") { - int pooled_width = get_node_attr_i(node, "output_width", 1); - int pooled_height = get_node_attr_i(node, "output_height", 1); - float spatial_scale = get_node_attr_f(node, "spatial_scale", 1.f); - int sampling_ratio = get_node_attr_i(node, "sampling_ratio", 0); - fprintf(pp, " 0=%d", pooled_width); - fprintf(pp, " 1=%d", pooled_height); - fprintf(pp, " 2=%f", spatial_scale); - fprintf(pp, " 3=%d", sampling_ratio); - } else if (op == "ShuffleChannel") { - int group = get_node_attr_i(node, "group", 1); - int reverse = get_node_attr_i(node, "reverse", 0); - fprintf(pp, " 0=%d", group); - fprintf(pp, " 1=%d", reverse); - } else if (op == "Sigmoid") { - // no param - } else if (op == "Sin") { - int op_type = 9; - fprintf(pp, " 0=%d", op_type); - } else if (op == "SkipLayerNormalization") { - const onnx::TensorProto& W = weights[node.input(2)]; - const onnx::TensorProto& B = weights[node.input(3)]; - const onnx::TensorProto& B2 = weights[node.input(4)]; - - fprintf(pp, " 0=%d", get_tensor_proto_data_size(B)); - - int quantize_tag = 0; - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(W, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B, bp); - - fwrite(&quantize_tag, sizeof(int), 1, bp); - - fwrite_tensor_proto_data(B2, bp); - } else if (op == "Slice") { - bool use_crop = true; - - std::vector starts; - std::vector ends; - std::vector axes; - std::vector steps; - if (node.input_size() == 1) { - starts = get_node_attr_ai(node, "starts"); - ends = get_node_attr_ai(node, "ends"); - axes = get_node_attr_ai(node, "axes"); - steps = get_node_attr_ai(node, "steps"); // TODO - } else { - starts = get_node_attr_from_input_ai(weights[node.input(1)]); - ends = get_node_attr_from_input_ai(weights[node.input(2)]); - if (node.input_size() >= 4) axes = get_node_attr_from_input_ai(weights[node.input(3)]); - if (node.input_size() >= 5) steps = get_node_attr_from_input_ai(weights[node.input(4)]); - } - - // assert step == 1 or step >= ends - for (int i = 0; i < (int)steps.size(); i++) { - if (steps[i] != 1 && steps[i] < ends[i]) { - use_crop = false; - fprintf(stderr, "Unsupported slice step ! 
Use custom TensorSlice\n"); - } - } - - if (use_crop) { - // filter out N-dim axis - if (!axes.empty()) { - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis == 0) { - starts.erase(starts.begin() + i); - ends.erase(ends.begin() + i); - axes.erase(axes.begin() + i); - break; - } - } - } - - fprintf(pp, " -23309=%d", (int)starts.size()); - for (int i = 0; i < (int)starts.size(); i++) { - fprintf(pp, ",%d", starts[i]); - } - fprintf(pp, " -23310=%d", (int)ends.size()); - for (int i = 0; i < (int)ends.size(); i++) { - fprintf(pp, ",%d", ends[i]); - } - if (!axes.empty()) { - fprintf(pp, " -23311=%d", (int)axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis == 0 || axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); - - if (axis > 0) axis = axis - 1; // -1 for skip N-dim - - fprintf(pp, ",%d", axis); - } - } - } else { - fprintf(pp, " -23300=%d", (int)starts.size()); - for (int i = 0; i < (int)starts.size(); i++) { - fprintf(pp, ",%d", starts[i]); - } - fprintf(pp, " -23301=%d", (int)ends.size()); - for (int i = 0; i < (int)ends.size(); i++) { - fprintf(pp, ",%d", ends[i]); - } - if (!axes.empty()) { - fprintf(pp, " -23302=%d", (int)axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - int axis = axes[i]; - if (axis > 3 || axis < -3) fprintf(stderr, "Unsupported slice axes !\n"); - fprintf(pp, ",%d", axis); - } - } - if (!steps.empty()) { - fprintf(pp, " -23303=%d", (int)steps.size()); - for (int i = 0; i < (int)steps.size(); i++) { - int step = steps[i]; - if (step == 0) fprintf(stderr, "Unsupported slice step ! Unsupported slice step\n"); - fprintf(pp, ",%d", step); - } - } - } - } else if (op == "Softmax") { - int axis = get_node_attr_i(node, "axis", 1); - fprintf(pp, " 0=%d", axis - 1); - fprintf(pp, " 1=1"); - } else if (op == "Split") { - int axis = get_node_attr_i(node, "axis", 0); - std::vector split = get_node_attr_ai(node, "split"); - if (axis < 1) fprintf(stderr, "Unsupported split axis !\n"); - - fprintf(pp, " -23300=%d", output_size); - if (split.empty()) { - for (int i = 0; i < output_size; i++) { - fprintf(pp, ",-233"); - } - } else { - for (size_t i = 0; i < split.size() - 1; i++) { - fprintf(pp, ",%d", split[i]); - } - fprintf(pp, ",-233"); - } - fprintf(pp, " 1=%d", axis - 1); - } else if (op == "Sqrt") { - int op_type = 5; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Squeeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - - if (axes.empty()) { - fprintf(pp, " 0=1"); - fprintf(pp, " 1=1"); - fprintf(pp, " 2=1"); - } else { - bool flag = true; - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0) { - flag = false; - break; - } - } - if (flag == true) { - fprintf(pp, " -23303=%zu", axes.size()); - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0 || axes[i] > 3 || axes[i] < -3) - fprintf(stderr, "Unsupported squeeze axes !: %d, %s\n", axes[i], node.name().c_str()); - fprintf(pp, ",%d", axes[i] - 1); - } - } - } - } else if (op == "Sub") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - - int with_scalar = get_node_attr_i(node, "with_scalar", 0); - float b = get_node_attr_f(node, "b", 0.f); - if (with_scalar) { - fprintf(pp, " 1=%d", with_scalar); - fprintf(pp, " 2=%e", b); - } - } else if (op == "Sum") { - int op_type = 1; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Swish") { - // no param - } else if (op == "Tan") { - int op_type = 11; - fprintf(pp, " 0=%d", op_type); - } else if (op == "Tanh") { - int op_type = 16; - 
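A note on the parameter lines being rewritten throughout this hunk: ncnn .param entries are positional "id=value" pairs (floats printed with %e), and array-valued parameters are encoded under the negative key -23300 - id as a count followed by comma-separated values, which is why Slice emits " -23309=%d" and then ",%d" per element. The recurring "axis - 1" adjustments (Slice, Softmax, Split, Squeeze) reflect that ncnn blobs carry no batch dimension, so ONNX axis 1 maps to ncnn axis 0 and axis 0 itself is rejected. A minimal sketch of this encoding, using hypothetical axes and the patch's new Allman style:

    #include <cstdio>
    #include <vector>

    // Write one array-valued ncnn param: " -233xx=count,v0,v1,...".
    static void write_array_param(FILE* pp, int id, const std::vector<int>& v)
    {
        fprintf(pp, " %d=%d", -23300 - id, (int)v.size());
        for (int x : v)
            fprintf(pp, ",%d", x);
    }

    int main()
    {
        std::vector<int> onnx_axes = {1, 2};  // hypothetical ONNX reduction axes
        std::vector<int> ncnn_axes;
        for (int a : onnx_axes)
            ncnn_axes.push_back(a > 0 ? a - 1 : a);  // drop the N (batch) dim
        write_array_param(stdout, 3, ncnn_axes);     // prints " -23303=2,0,1"
        fprintf(stdout, "\n");
        return 0;
    }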
fprintf(pp, " 0=%d", op_type); - } else if (op == "TopK") { - int axis = get_node_attr_i(node, "axis", -1); - axis = axis > 0 ? axis - 1 : axis; - int largest = get_node_attr_i(node, "largest", 1); - int sorted = get_node_attr_i(node, "sorted", 1); - fprintf(pp, " 0=%d", axis); - fprintf(pp, " 1=%d", largest); - fprintf(pp, " 2=%d", sorted); - } else if (op == "Transpose") { - std::vector perm = get_node_attr_ai(node, "perm"); - - if (perm.size() == 3) { - if (perm[1] == 1 && perm[2] == 2) - fprintf(pp, " 0=0"); // w h - else if (perm[1] == 2 && perm[2] == 1) - fprintf(pp, " 0=1"); // h w - else if (perm[0] == 1 && perm[1] == 0 && perm[2] == 2) - fprintf(pp, " 0=0"); // w h - else if (perm[0] == 2 && perm[1] == 0 && perm[2] == 1) - fprintf(pp, " 0=1"); // h w - } else if (perm.size() == 4) { - if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3) - fprintf(pp, " 0=0"); // w h c - else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 2) - fprintf(pp, " 0=1"); // h w c - else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3) - fprintf(pp, " 0=2"); // w c h - else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 1) - fprintf(pp, " 0=3"); // c w h - else if (perm[1] == 3 && perm[2] == 1 && perm[3] == 2) - fprintf(pp, " 0=4"); // h c w - else if (perm[1] == 3 && perm[2] == 2 && perm[3] == 1) - fprintf(pp, " 0=5"); // c h w - } else if (perm.size() == 5) { - if (perm[1] == 1 && perm[2] == 2 && perm[3] == 3 && perm[4] == 4) - fprintf(pp, " 0=0"); // wx h c - else if (perm[1] == 1 && perm[2] == 3 && perm[3] == 4 && perm[4] == 2) - fprintf(pp, " 0=1"); // h wx c - else if (perm[1] == 2 && perm[2] == 1 && perm[3] == 3 && perm[4] == 4) - fprintf(pp, " 0=2"); // wx c h - else if (perm[1] == 2 && perm[2] == 3 && perm[3] == 4 && perm[4] == 1) - fprintf(pp, " 0=3"); // c wx h - else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 1 && perm[4] == 2) - fprintf(pp, " 0=4"); // h c wx - else if (perm[1] == 3 && perm[2] == 4 && perm[3] == 2 && perm[4] == 1) - fprintf(pp, " 0=5"); // c h wx - else - fprintf(stderr, "Unsupported transpose type !\n"); - } - } else if (op == "Upsample") { - std::string mode = get_node_attr_s(node, "mode"); - std::string align = get_node_attr_s(node, "coordinate_transformation_mode"); - - std::vector scales; - - if (node.input_size() == 1) { - scales = get_node_attr_af(node, "scales"); - } else { - scales = get_node_attr_from_input_af(weights[node.input(1)]); - } - - int resize_type = 1; - if (mode == "nearest") { - resize_type = 1; - } else if (mode == "bilinear" || mode == "linear") { - resize_type = 2; - } else if (mode == "trilinear") { - fprintf(stderr, "Unsupported Upsample mode !\n"); - } - - float h_scale = 1.f; - float w_scale = 1.f; - if (scales.size() == 2) { - w_scale = scales[1]; - } else if (scales.size() == 3) { - h_scale = scales[1]; - w_scale = scales[2]; - } else if (scales.size() == 4) { - h_scale = scales[2]; - w_scale = scales[3]; - - if (scales[1] != 1.f) fprintf(stderr, "Unsupported Upsample scales !\n"); - } else { - fprintf(stderr, "Unsupported Upsample scales !\n"); - } - - int align_corner = 0; - if (align == "align_corners") { - align_corner = 1; - } - - fprintf(pp, " 0=%d", resize_type); - fprintf(pp, " 1=%e", h_scale); - fprintf(pp, " 2=%e", w_scale); - fprintf(pp, " 6=%d", align_corner); - } else if (op == "Unsqueeze") { - std::vector axes = get_node_attr_ai(node, "axes"); - bool flag = true; - for (int i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0) { - flag = false; - break; - } - } - if (flag) { - fprintf(pp, " -23303=%zu", axes.size()); - for (int 
i = 0; i < (int)axes.size(); i++) { - if (axes[i] == 0 || axes[i] > 4 || axes[i] < -4) - fprintf(stderr, "Unsupported unsqueeze axes !: %d, %s\n", axes[i], node.name().c_str()); - fprintf(pp, ",%d", axes[i] - 1); - } - } - } else if (op == "Yolov3DetectionOutput") { - int num_class = get_node_attr_i(node, "num_class"); - int num_box = get_node_attr_i(node, "num_box"); - float confidence_threshold = get_node_attr_f(node, "confidence_threshold"); - float nms_threshold = get_node_attr_f(node, "nms_threshold"); - fprintf(pp, " 0=%d", num_class); - fprintf(pp, " 1=%d", num_box); - fprintf(pp, " 2=%e", confidence_threshold); - fprintf(pp, " 3=%e", nms_threshold); - std::vector biases = get_node_attr_af(node, "biases"); - if (biases.size() > 0) { - fprintf(pp, " -23304=%zu", biases.size()); - for (int i = 0; i < (int)biases.size(); i++) { - fprintf(pp, ",%e", biases[i]); - } - } - std::vector mask = get_node_attr_af(node, "mask"); - if (mask.size() > 0) { - fprintf(pp, " -23305=%zu", mask.size()); - for (int i = 0; i < (int)mask.size(); i++) { - fprintf(pp, ",%e", mask[i]); - } - } - std::vector anchors_scale = get_node_attr_af(node, "anchors_scale"); - if (anchors_scale.size() > 0) { - fprintf(pp, " -23306=%zu", anchors_scale.size()); - for (int i = 0; i < (int)anchors_scale.size(); i++) { - fprintf(pp, ",%e", anchors_scale[i]); - } - } - } else { - // TODO op specific param - } - - fprintf(pp, "\n"); - for (int j = 0; j < output_size; j++) { - const std::string& output_name = node.output(j); - if (node_reference.find(output_name) != node_reference.end()) { - int refcount = node_reference[output_name]; - if (refcount > 1) { - char splitname[256]; - sprintf(splitname, "splitncnn_%d", internal_split); - fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - - fprintf(pp, " %s", output_name.c_str()); - - for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); - } - fprintf(pp, "\n"); - - internal_split++; } - } } - } - fclose(pp); - fclose(bp); - fprintf(stderr, "onnx2ncnn finish\n"); - return 0; + fclose(pp); + fclose(bp); + fprintf(stderr, "onnx2ncnn finish\n"); + return 0; } diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp index dd1fe2c4f6..efecdcd199 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.cpp @@ -13,158 +13,179 @@ * @param context * @return std::tuple> */ -std::tuple> query_shape( - onnx::GraphProto* mutable_graph, onnx::NodeProto* target, - const std::map& weights, - std::map>& context) { - // emplace all input nodes - const int input_count = mutable_graph->input_size(); - for (int i = 0; i < input_count; i++) { - auto inp = mutable_graph->input(i); - onnx::TypeProto inp_type = inp.type(); - onnx::TensorShapeProto shape_proto = inp_type.tensor_type().shape(); - - auto dim_size = shape_proto.dim_size(); - std::vector shape(dim_size); - for (int index = 0; index < dim_size; ++index) { - shape[index] = shape_proto.dim(index).dim_value(); - } - - context.emplace(inp.name(), shape); - } - - // BFS the tree, `target` as root, onnx::graph inputs and weights as leaf nodes - std::vector serial = {target}; - { - std::set mark_as_appended = {}; - while (true) { - int start = 0, end = serial.size(); - for (int i = start; i < end; ++i) { - auto node_ptr = serial[i]; - auto len = node_ptr->input_size(); - - for (int j = 0; j < len; ++j) { - std::string name = 
node_ptr->input(j); - if (context.find(name) != context.end()) { - // if input founded, skip - continue; - } - - if (weights.find(name) != weights.end()) { - // if founded in weights, extract shape to context - auto weight = weights.at(name); - std::vector shape; - for (auto index = 0; index < weight.dims_size(); ++index) { - shape.emplace_back(weight.dims(index)); - } - context.emplace(name, shape); - continue; - } - - if (mark_as_appended.find(name) != mark_as_appended.end()) { - // if mark as appended, skip - continue; - } - // else append it to serialization list - auto depend_ptr = find_node_by_output_name(mutable_graph, name); - if (depend_ptr == nullptr) { - fprintf(stderr, "cannot find %s from graph !\n", name.c_str()); - return std::make_tuple(false, std::vector{}); - } - mark_as_appended.insert(name); - serial.emplace_back(depend_ptr); +std::tuple> query_shape(onnx::GraphProto* mutable_graph, + onnx::NodeProto* target, + const std::map& weights, + std::map>& context) +{ + // emplace all input nodes + const int input_count = mutable_graph->input_size(); + for (int i = 0; i < input_count; i++) + { + auto inp = mutable_graph->input(i); + onnx::TypeProto inp_type = inp.type(); + onnx::TensorShapeProto shape_proto = inp_type.tensor_type().shape(); + + auto dim_size = shape_proto.dim_size(); + std::vector shape(dim_size); + for (int index = 0; index < dim_size; ++index) + { + shape[index] = shape_proto.dim(index).dim_value(); } - } - if (serial.size() <= end) { - // if not new node added, quit - break; - } - - // update start and end position, continue BFS the tree - start = end; - end = serial.size(); + context.emplace(inp.name(), shape); } - } - - // for each node in serialization list, calculate the output shape - { - std::reverse(serial.begin(), serial.end()); - for (auto node : serial) { - if (node->op_type() == "Conv") { - auto inp = context[node->input(0)]; - auto weight = context[node->input(1)]; - assert(inp.size() == 4 and weight.size() == 4); - - int group = get_node_attr_i(*node, "group", 1); - assert(group == 1); - - // treat multiple spatial attr as single one -#define EXTRACT_REPEATED_PARAM(NAME, ATTR, DEFAULT) \ - int ATTR = DEFAULT; \ - { \ - std::vector _vec = get_node_attr_ai(*node, NAME); \ - if (not _vec.empty()) { \ - ATTR = _vec[0]; \ - } \ - } - - EXTRACT_REPEATED_PARAM("dilations", dilation, 1); - EXTRACT_REPEATED_PARAM("pads", pad, 0); - EXTRACT_REPEATED_PARAM("strides", stride, 1); - -#undef EXTRACT_REPEATED_PARAM - int on = inp[0]; - int oc = weight[0]; - int oh = (inp[2] + 2 * pad - weight[2]) / stride + 1; - int ow = (inp[3] + 2 * pad - weight[3]) / stride + 1; - context.emplace(node->output(0), std::vector{on, oc, oh, ow}); - - } else if (node->op_type() == "Shape") { - auto inp = context[node->input(0)]; - context.emplace(node->output(0), std::vector{1, inp[1], inp[2], inp[3]}); - - } else if (node->op_type() == "Slice") { - assert(node->input_size() >= 4); + // BFS the tree, `target` as root, onnx::graph inputs and weights as leaf nodes + std::vector serial = {target}; + { + std::set mark_as_appended = {}; + while (true) + { + int start = 0, end = serial.size(); + for (int i = start; i < end; ++i) + { + auto node_ptr = serial[i]; + auto len = node_ptr->input_size(); + + for (int j = 0; j < len; ++j) + { + std::string name = node_ptr->input(j); + if (context.find(name) != context.end()) + { + // if input founded, skip + continue; + } + + if (weights.find(name) != weights.end()) + { + // if founded in weights, extract shape to context + auto weight = 
weights.at(name); + std::vector shape; + for (auto index = 0; index < weight.dims_size(); ++index) + { + shape.emplace_back(weight.dims(index)); + } + context.emplace(name, shape); + continue; + } + + if (mark_as_appended.find(name) != mark_as_appended.end()) + { + // if mark as appended, skip + continue; + } + // else append it to serialization list + auto depend_ptr = find_node_by_output_name(mutable_graph, name); + if (depend_ptr == nullptr) + { + fprintf(stderr, "cannot find %s from graph !\n", name.c_str()); + return std::make_tuple(false, std::vector{}); + } + mark_as_appended.insert(name); + serial.emplace_back(depend_ptr); + } + } - auto inp = context[node->input(0)]; - int start = get_node_attr_from_input(weights.at(node->input(1))); - int end = get_node_attr_from_input(weights.at(node->input(2))); - int axes = get_node_attr_from_input(weights.at(node->input(3))); + if (serial.size() <= end) + { + // if not new node added, quit + break; + } - if (axes != 0) { - fprintf(stderr, "Not support axes=%d !\n", axes); - return std::make_tuple(false, std::vector{}); + // update start and end position, continue BFS the tree + start = end; + end = serial.size(); } + } - assert(inp.size() >= end - start); - context.emplace(node->output(0), std::vector{inp.begin() + start, inp.begin() + end}); - - } else if (node->op_type() == "Concat") { - assert(node->input_size() >= 2); - - auto axis = get_node_attr_i(*node, "axis", 0); - if (axis != 0) { - fprintf(stderr, "Not support axes=%d !\n", axis); - return std::make_tuple(false, std::vector{}); - } + // for each node in serialization list, calculate the output shape + { + std::reverse(serial.begin(), serial.end()); + for (auto node : serial) + { + if (node->op_type() == "Conv") + { + auto inp = context[node->input(0)]; + auto weight = context[node->input(1)]; + assert(inp.size() == 4 and weight.size() == 4); + + int group = get_node_attr_i(*node, "group", 1); + assert(group == 1); + + // treat multiple spatial attr as single one +#define EXTRACT_REPEATED_PARAM(NAME, ATTR, DEFAULT) \ + int ATTR = DEFAULT; \ + { \ + std::vector _vec = get_node_attr_ai(*node, NAME); \ + if (not _vec.empty()) \ + { \ + ATTR = _vec[0]; \ + } \ + } - std::vector inp = context[node->input(0)]; - std::vector w_data = get_node_attr_from_input_ai(weights.at(node->input(1))); + EXTRACT_REPEATED_PARAM("dilations", dilation, 1); + EXTRACT_REPEATED_PARAM("pads", pad, 0); + EXTRACT_REPEATED_PARAM("strides", stride, 1); - // concat data on axis 0 - inp.insert(inp.end(), w_data.begin(), w_data.end()); - context.emplace(node->output(0), inp); +#undef EXTRACT_REPEATED_PARAM - } else { - fprintf(stderr, "Unsupported type %s in query_shape !\n", node->op_type().c_str()); - return std::make_tuple(false, std::vector{}); - } + int on = inp[0]; + int oc = weight[0]; + int oh = (inp[2] + 2 * pad - weight[2]) / stride + 1; + int ow = (inp[3] + 2 * pad - weight[3]) / stride + 1; + context.emplace(node->output(0), std::vector{on, oc, oh, ow}); + } + else if (node->op_type() == "Shape") + { + auto inp = context[node->input(0)]; + context.emplace(node->output(0), std::vector{1, inp[1], inp[2], inp[3]}); + } + else if (node->op_type() == "Slice") + { + assert(node->input_size() >= 4); + + auto inp = context[node->input(0)]; + int start = get_node_attr_from_input(weights.at(node->input(1))); + int end = get_node_attr_from_input(weights.at(node->input(2))); + int axes = get_node_attr_from_input(weights.at(node->input(3))); + + if (axes != 0) + { + fprintf(stderr, "Not support axes=%d !\n", axes); + 
return std::make_tuple(false, std::vector{}); + } + + assert(inp.size() >= end - start); + context.emplace(node->output(0), std::vector{inp.begin() + start, inp.begin() + end}); + } + else if (node->op_type() == "Concat") + { + assert(node->input_size() >= 2); + + auto axis = get_node_attr_i(*node, "axis", 0); + if (axis != 0) + { + fprintf(stderr, "Not support axes=%d !\n", axis); + return std::make_tuple(false, std::vector{}); + } + + std::vector inp = context[node->input(0)]; + std::vector w_data = get_node_attr_from_input_ai(weights.at(node->input(1))); + + // concat data on axis 0 + inp.insert(inp.end(), w_data.begin(), w_data.end()); + context.emplace(node->output(0), inp); + } + else + { + fprintf(stderr, "Unsupported type %s in query_shape !\n", node->op_type().c_str()); + return std::make_tuple(false, std::vector{}); + } + } } - } - assert(context.find(target->output(0)) != context.end()); - auto target_shape = context[target->output(0)]; - return std::make_tuple(true, target_shape); + assert(context.find(target->output(0)) != context.end()); + auto target_shape = context[target->output(0)]; + return std::make_tuple(true, target_shape); } diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h index fa62ffe9de..55d966ae83 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/shape_inference.h @@ -13,7 +13,7 @@ * @param context * @return std::tuple> */ -std::tuple> query_shape( - onnx::GraphProto* mutable_graph, onnx::NodeProto* target, - const std::map& weights, - std::map>& context); +std::tuple> query_shape(onnx::GraphProto* mutable_graph, + onnx::NodeProto* target, + const std::map& weights, + std::map>& context); diff --git a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h index 792db0ed34..ab991a52f9 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h +++ b/csrc/mmdeploy/backend_ops/ncnn/onnx2ncnn/utils.h @@ -21,381 +21,496 @@ * @param name * @return onnx::NodeProto* */ -static onnx::NodeProto* find_node_by_output_name(onnx::GraphProto* mutable_graph, - const std::string& name) { - const int input_count = mutable_graph->node_size(); - for (int i = 0; i < input_count; ++i) { - onnx::NodeProto* node = mutable_graph->mutable_node(i); - - for (int j = 0; j < node->output_size(); ++j) { - auto output = node->output(j); - if (output == name) { - return node; - } +static onnx::NodeProto* find_node_by_output_name(onnx::GraphProto* mutable_graph, + const std::string& name) +{ + const int input_count = mutable_graph->node_size(); + for (int i = 0; i < input_count; ++i) + { + onnx::NodeProto* node = mutable_graph->mutable_node(i); + + for (int j = 0; j < node->output_size(); ++j) + { + auto output = node->output(j); + if (output == name) + { + return node; + } + } } - } - return nullptr; + return nullptr; } -static bool read_proto_from_binary(const char* filepath, onnx::ModelProto* message) { - std::ifstream fs(filepath, std::ifstream::in | std::ifstream::binary); - if (!fs.is_open()) { - fprintf(stderr, "open failed %s\n", filepath); - return false; - } +static bool read_proto_from_binary(const char* filepath, onnx::ModelProto* message) +{ + std::ifstream fs(filepath, std::ifstream::in | std::ifstream::binary); + if (!fs.is_open()) + { + fprintf(stderr, "open failed %s\n", filepath); + return false; + } - google::protobuf::io::IstreamInputStream input(&fs); - 
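A note on query_shape above: the angle-bracket contents of its types were lost in extraction; reconstructed from the surrounding usage, the signature reads std::tuple<bool, std::vector<int>> query_shape(onnx::GraphProto* mutable_graph, onnx::NodeProto* target, const std::map<std::string, onnx::TensorProto>& weights, std::map<std::string, std::vector<int>>& context). It gathers the target's transitive dependencies breadth-first down to graph inputs and initializers, reverses that list into topological order, and replays it, so only the ops it models are supported: Conv with group == 1, Shape, Slice along axis 0, and Concat along axis 0. A usage sketch under those assumptions (the helper name and call site are hypothetical):

    #include <cstdio>
    #include <map>
    #include <tuple>
    #include <vector>

    #include "shape_inference.h"  // declares query_shape

    // Resolve one node's output shape, memoizing results in a shared context.
    static bool resolve_shape(onnx::GraphProto* graph, onnx::NodeProto* node,
                              const std::map<std::string, onnx::TensorProto>& weights,
                              std::map<std::string, std::vector<int>>& context,
                              std::vector<int>& shape)
    {
        bool ok = false;
        std::tie(ok, shape) = query_shape(graph, node, weights, context);
        if (!ok)
            fprintf(stderr, "query_shape failed for %s\n", node->name().c_str());
        return ok;
    }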
google::protobuf::io::CodedInputStream codedstr(&input); + google::protobuf::io::IstreamInputStream input(&fs); + google::protobuf::io::CodedInputStream codedstr(&input); #if GOOGLE_PROTOBUF_VERSION >= 3011000 - codedstr.SetTotalBytesLimit(INT_MAX); + codedstr.SetTotalBytesLimit(INT_MAX); #else - codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); + codedstr.SetTotalBytesLimit(INT_MAX, INT_MAX / 2); #endif - bool success = message->ParseFromCodedStream(&codedstr); + bool success = message->ParseFromCodedStream(&codedstr); - fs.close(); + fs.close(); - return success; + return success; } -static std::vector get_node_attr_ai(const onnx::NodeProto& node, const char* key) { - std::vector v; +static std::vector get_node_attr_ai(const onnx::NodeProto& node, const char* key) +{ + std::vector v; + + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + v.resize(attr.ints_size()); + for (int j = 0; j < attr.ints_size(); j++) + { + v[j] = std::max(std::min(attr.ints(j), (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + + break; + } + } - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - v.resize(attr.ints_size()); - for (int j = 0; j < attr.ints_size(); j++) { - v[j] = std::max(std::min(attr.ints(j), (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - } + return v; +} - break; +static void set_node_attr_ai(onnx::NodeProto& node, const char* key, const std::vector& value) +{ + onnx::AttributeProto* attr_group = node.add_attribute(); + attr_group->set_name(key); + for (auto v : value) + { + attr_group->add_ints(v); } - } - return v; + return; } -static void set_node_attr_ai(onnx::NodeProto& node, const char* key, - const std::vector& value) { - onnx::AttributeProto* attr_group = node.add_attribute(); - attr_group->set_name(key); - for (auto v : value) { - attr_group->add_ints(v); - } +static std::vector get_node_attr_af(const onnx::NodeProto& node, const char* key) +{ + std::vector v; + + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + v.resize(attr.floats_size()); + for (int j = 0; j < attr.floats_size(); j++) + { + v[j] = attr.floats(j); + } + + break; + } + } - return; + return v; } -static std::vector get_node_attr_af(const onnx::NodeProto& node, const char* key) { - std::vector v; +static int get_node_attr_i(const onnx::NodeProto& node, const char* key, int def = 0) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return std::max(std::min(attr.i(), (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + } - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - v.resize(attr.floats_size()); - for (int j = 0; j < attr.floats_size(); j++) { - v[j] = attr.floats(j); - } + return def; +} - break; +static float get_node_attr_f(const onnx::NodeProto& node, const char* key, float def = 0.f) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.f(); + } } - } - return v; + return def; } -static int get_node_attr_i(const onnx::NodeProto& node, const char* key, int def = 0) { - for (int i = 
0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return std::max(std::min(attr.i(), (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); +static std::string get_node_attr_s(const onnx::NodeProto& node, const char* key, const std::string& def = std::string()) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.s(); + } } - } - return def; + return def; } -static float get_node_attr_f(const onnx::NodeProto& node, const char* key, float def = 0.f) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.f(); +static onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char* key) +{ + for (int i = 0; i < node.attribute_size(); i++) + { + const onnx::AttributeProto& attr = node.attribute(i); + if (attr.name() == key) + { + return attr.t(); + } } - } - return def; + return onnx::TensorProto(); } -static std::string get_node_attr_s(const onnx::NodeProto& node, const char* key, - const std::string& def = std::string()) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.s(); +template +static T get_node_attr_from_input(const onnx::TensorProto& tp) +{ + T v = 0.f; + + // float + if (tp.data_type() == 1) + { + const float* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const float*)tp.raw_data().data(); + } + else + { + shape_data = tp.float_data().data(); + } + v = shape_data[0]; + } + // double + else if (tp.data_type() == 11) + { + const double* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const double*)tp.raw_data().data(); + } + else + { + shape_data = tp.double_data().data(); + } + v = shape_data[0]; + } + // int64 + else if (tp.data_type() == 7) + { + const int64_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int64_t*)tp.raw_data().data(); + } + else + { + shape_data = tp.int64_data().data(); + } + v = std::max(std::min(shape_data[0], (::google::protobuf::int64)INT_MAX), + (::google::protobuf::int64)INT_MIN); + } + // int32 + else if (tp.data_type() == 6) + { + const int32_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int32_t*)tp.raw_data().data(); + } + else + { + shape_data = tp.int32_data().data(); + } + v = shape_data[0]; + } + else + { + // fprintf(stderr, "tp.name: %s\n", tp.name().c_str()); + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); + fprintf(stderr, "get_node_attr_from_input\n"); + abort(); } - } - return def; + return v; } -static onnx::TensorProto get_node_attr_tensor(const onnx::NodeProto& node, const char* key) { - for (int i = 0; i < node.attribute_size(); i++) { - const onnx::AttributeProto& attr = node.attribute(i); - if (attr.name() == key) { - return attr.t(); +static std::vector get_node_attr_from_input_ai(const onnx::TensorProto& tp) +{ + int size = 0; + + std::vector v; + + // int64 + if (tp.data_type() == 7) + { + const int64_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int64_t*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 8); + } + else + { + shape_data = tp.int64_data().data(); + size = tp.int64_data_size(); + } + for (int j = 0; j < size; j++) + { + int vi = std::max(std::min(shape_data[j], (::google::protobuf::int64)INT_MAX), 
+ (::google::protobuf::int64)INT_MIN); + v.push_back(vi); + } + } + // int32 + else if (tp.data_type() == 6) + { + const int32_t* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const int32_t*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 4); + } + else + { + shape_data = tp.int32_data().data(); + size = tp.int32_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back(shape_data[j]); + } + } + else + { + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); } - } - return onnx::TensorProto(); + return v; } -template -static T get_node_attr_from_input(const onnx::TensorProto& tp) { - T v = 0.f; - - // float - if (tp.data_type() == 1) { - const float* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const float*)tp.raw_data().data(); - } else { - shape_data = tp.float_data().data(); - } - v = shape_data[0]; - } - // double - else if (tp.data_type() == 11) { - const double* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const double*)tp.raw_data().data(); - } else { - shape_data = tp.double_data().data(); - } - v = shape_data[0]; - } - // int64 - else if (tp.data_type() == 7) { - const int64_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int64_t*)tp.raw_data().data(); - } else { - shape_data = tp.int64_data().data(); - } - v = std::max(std::min(shape_data[0], (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - } - // int32 - else if (tp.data_type() == 6) { - const int32_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int32_t*)tp.raw_data().data(); - } else { - shape_data = tp.int32_data().data(); - } - v = shape_data[0]; - } else { - // fprintf(stderr, "tp.name: %s\n", tp.name().c_str()); - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - fprintf(stderr, "get_node_attr_from_input\n"); - abort(); - } - - return v; -} +static std::vector get_node_attr_from_input_af(const onnx::TensorProto& tp) +{ + int size = 0; + + std::vector v; + + // float + if (tp.data_type() == 1) + { + const float* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const float*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 4); + } + else + { + shape_data = tp.float_data().data(); + size = tp.float_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back(shape_data[j]); + } + } + // double + else if (tp.data_type() == 11) + { + const double* shape_data = 0; + if (tp.has_raw_data()) + { + shape_data = (const double*)tp.raw_data().data(); + size = (int)(tp.raw_data().size() / 8); + } + else + { + shape_data = tp.double_data().data(); + size = tp.double_data_size(); + } + for (int j = 0; j < size; j++) + { + v.push_back((float)shape_data[j]); + } + } + else + { + fprintf(stderr, "Unknown data type %d\n", tp.data_type()); + } -static std::vector get_node_attr_from_input_ai(const onnx::TensorProto& tp) { - int size = 0; - - std::vector v; - - // int64 - if (tp.data_type() == 7) { - const int64_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int64_t*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 8); - } else { - shape_data = tp.int64_data().data(); - size = tp.int64_data_size(); - } - for (int j = 0; j < size; j++) { - int vi = std::max(std::min(shape_data[j], (::google::protobuf::int64)INT_MAX), - (::google::protobuf::int64)INT_MIN); - v.push_back(vi); - } - } - // int32 - else if (tp.data_type() == 6) { - const int32_t* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const int32_t*)tp.raw_data().data(); - 
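For readers tracing the data_type() comparisons in these helpers: the integer literals are onnx::TensorProto_DataType values from the generated onnx.pb.h, namely FLOAT = 1, INT32 = 6, INT64 = 7, BOOL = 9, DOUBLE = 11. A small readability sketch (a suggestion, not part of the patch) using the generated enum names instead of magic numbers:

    #include "onnx.pb.h"

    // Equivalent to the numeric checks in get_node_attr_from_input_ai.
    static bool is_index_tensor(const onnx::TensorProto& tp)
    {
        return tp.data_type() == onnx::TensorProto::INT64   // 7
            || tp.data_type() == onnx::TensorProto::INT32;  // 6
    }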
size = (int)(tp.raw_data().size() / 4); - } else { - shape_data = tp.int32_data().data(); - size = tp.int32_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back(shape_data[j]); - } - } else { - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - } - - return v; + return v; } -static std::vector get_node_attr_from_input_af(const onnx::TensorProto& tp) { - int size = 0; - - std::vector v; - - // float - if (tp.data_type() == 1) { - const float* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const float*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 4); - } else { - shape_data = tp.float_data().data(); - size = tp.float_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back(shape_data[j]); - } - } - // double - else if (tp.data_type() == 11) { - const double* shape_data = 0; - if (tp.has_raw_data()) { - shape_data = (const double*)tp.raw_data().data(); - size = (int)(tp.raw_data().size() / 8); - } else { - shape_data = tp.double_data().data(); - size = tp.double_data_size(); - } - for (int j = 0; j < size; j++) { - v.push_back((float)shape_data[j]); - } - } else { - fprintf(stderr, "Unknown data type %d\n", tp.data_type()); - } - - return v; -} +static int get_tensor_proto_data_size(const onnx::TensorProto& tp) +{ + if (tp.has_raw_data()) + { + if (tp.data_type() == 1 || tp.data_type() == 6) + { + const std::string& raw_data = tp.raw_data(); + int size = (int)raw_data.size() / 4; + return size; + } + else if (tp.data_type() == 7 || tp.data_type() == 11) + { + const std::string& raw_data = tp.raw_data(); + int size = (int)raw_data.size() / 8; + return size; + } + else if (tp.data_type() == 9) + { + const std::string& raw_data = tp.raw_data(); + return 0; + } + } + else if (tp.data_type() == 1) + { + return tp.float_data_size(); + } + else if (tp.data_type() == 7) + { + return tp.int64_data_size(); + } + else if (tp.data_type() == 6) + { + return tp.int32_data_size(); + } + else if (tp.data_type() == 11) + { + return tp.double_data_size(); + } -static int get_tensor_proto_data_size(const onnx::TensorProto& tp) { - if (tp.has_raw_data()) { - if (tp.data_type() == 1 || tp.data_type() == 6) { - const std::string& raw_data = tp.raw_data(); - int size = (int)raw_data.size() / 4; - return size; - } else if (tp.data_type() == 7 || tp.data_type() == 11) { - const std::string& raw_data = tp.raw_data(); - int size = (int)raw_data.size() / 8; - return size; - } else if (tp.data_type() == 9) { - const std::string& raw_data = tp.raw_data(); - return 0; - } - } else if (tp.data_type() == 1) { - return tp.float_data_size(); - } else if (tp.data_type() == 7) { - return tp.int64_data_size(); - } else if (tp.data_type() == 6) { - return tp.int32_data_size(); - } else if (tp.data_type() == 11) { - return tp.double_data_size(); - } - - return 0; + return 0; } -static void fwrite_tensor_proto_data(const onnx::TensorProto& tp, FILE* bp) { - int size = get_tensor_proto_data_size(tp); +static void fwrite_tensor_proto_data(const onnx::TensorProto& tp, FILE* bp) +{ + int size = get_tensor_proto_data_size(tp); - if (tp.has_raw_data()) { - const std::string& raw_data = tp.raw_data(); - fwrite(raw_data.data(), sizeof(float), size, bp); - } else if (tp.data_type() == 1) { - fwrite(tp.float_data().data(), sizeof(float), size, bp); - } + if (tp.has_raw_data()) + { + const std::string& raw_data = tp.raw_data(); + fwrite(raw_data.data(), sizeof(float), size, bp); + } + else if (tp.data_type() == 1) + { + fwrite(tp.float_data().data(), sizeof(float), size, bp); + } } 
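Two conventions in the helpers above are worth spelling out. First, for raw_data tensors the element count is the byte size divided by the element width: 4 bytes for FLOAT and INT32, 8 for INT64 and DOUBLE, while BOOL (type 9) is reported as size 0. Second, fwrite_tensor_proto_data writes size four-byte elements straight out of raw_data, which is byte-accurate only for four-byte payloads; that holds for the float weights the converter feeds it, and the fwrite_tensor_proto_data_to_float variant in the next hunk handles the other types by converting element-wise. A sketch of the width table, assuming the same numeric type codes:

    // Element width assumed by get_tensor_proto_data_size for raw_data tensors.
    static int element_width(int data_type)
    {
        switch (data_type)
        {
            case 1:   // FLOAT
            case 6:   // INT32
                return 4;
            case 7:   // INT64
            case 11:  // DOUBLE
                return 8;
            default:  // e.g. 9 (BOOL): reported as size 0
                return 0;
        }
    }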
-static void fwrite_tensor_proto_data_to_float(const onnx::TensorProto& tp, FILE* bp) { - int size = get_tensor_proto_data_size(tp); - size_t written_size; - if (tp.has_raw_data()) { - const std::string& raw_data = tp.raw_data(); - if (tp.data_type() == 6) { - int* intdataptr = (int*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 7) { - int64_t* intdataptr = (int64_t*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 9) { - bool* intdataptr = (bool*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 11) { - double* doubledataptr = (double*)raw_data.data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)doubledataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } - } else if (tp.data_type() == 6) { - int* intdataptr = (int*)tp.int32_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 7) { - int64_t* intdataptr = (int64_t*)tp.int64_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 9) { - int* intdataptr = (int*)tp.int64_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)intdataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } else if (tp.data_type() == 11) { - double* doubledataptr = (double*)tp.double_data().data(); - float* floatdataptr = (float*)std::malloc(sizeof(float) * size); - for (int i = 0; i < size; i++) { - floatdataptr[i] = (float)doubledataptr[i]; - } - written_size = fwrite(floatdataptr, sizeof(float), size, bp); - std::free(floatdataptr); - } +static void fwrite_tensor_proto_data_to_float(const onnx::TensorProto& tp, FILE* bp) +{ + int size = get_tensor_proto_data_size(tp); + size_t written_size; + if (tp.has_raw_data()) + { + const std::string& raw_data = tp.raw_data(); + if (tp.data_type() == 6) + { + int* intdataptr = (int*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 7) + { + int64_t* intdataptr = (int64_t*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); 
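// Editorial note, not part of the patch: in both the removed and the added
// version of fwrite_tensor_proto_data_to_float, written_size is assigned on
// every branch but never checked, the std::malloc results are used without a
// null test, and the non-raw bool branch (data_type 9) reads tp.int64_data()
// through an int*, which looks inherited rather than intentional; a hardened
// version would validate all three.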
+ for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 9) + { + bool* intdataptr = (bool*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 11) + { + double* doubledataptr = (double*)raw_data.data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)doubledataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + } + else if (tp.data_type() == 6) + { + int* intdataptr = (int*)tp.int32_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 7) + { + int64_t* intdataptr = (int64_t*)tp.int64_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 9) + { + int* intdataptr = (int*)tp.int64_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)intdataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } + else if (tp.data_type() == 11) + { + double* doubledataptr = (double*)tp.double_data().data(); + float* floatdataptr = (float*)std::malloc(sizeof(float) * size); + for (int i = 0; i < size; i++) + { + floatdataptr[i] = (float)doubledataptr[i]; + } + written_size = fwrite(floatdataptr, sizeof(float), size, bp); + std::free(floatdataptr); + } } diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/CMakeLists.txt b/csrc/mmdeploy/backend_ops/ncnn/ops/CMakeLists.txt index abfff8e3f2..755561c379 100755 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/CMakeLists.txt @@ -6,19 +6,17 @@ project(mmdeploy_ncnn_ops) file(GLOB_RECURSE NCNN_OPS_SRCS *.cpp) add_library(${PROJECT_NAME}_obj OBJECT "${NCNN_OPS_SRCS}") target_compile_definitions(${PROJECT_NAME}_obj PRIVATE -DMMDEPLOY_API_EXPORTS=1) -set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1) +set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE + 1) target_link_libraries(${PROJECT_NAME}_obj PRIVATE ncnn) -set(_COMMON_INCLUDE_DIRS - $ - $) -target_include_directories(${PROJECT_NAME}_obj - PUBLIC ${_COMMON_INCLUDE_DIRS}) +set(_COMMON_INCLUDE_DIRS $ + $) +target_include_directories(${PROJECT_NAME}_obj PUBLIC ${_COMMON_INCLUDE_DIRS}) mmdeploy_export(${PROJECT_NAME}_obj) mmdeploy_add_library(${PROJECT_NAME} SHARED EXCLUDE "") target_link_libraries(${PROJECT_NAME} PRIVATE ${PROJECT_NAME}_obj) -target_include_directories(${PROJECT_NAME} - PUBLIC ${_COMMON_INCLUDE_DIRS}) +target_include_directories(${PROJECT_NAME} PUBLIC ${_COMMON_INCLUDE_DIRS}) add_library(mmdeploy::ncnn_ops ALIAS ${PROJECT_NAME}) diff --git 
a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp old mode 100755 new mode 100644 index b865db7b25..32ae99669b --- a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.cpp @@ -3,51 +3,63 @@ #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(ConstantOfShape) -DEFINE_NCNN_OPS(ConstantOfShape, ConstantOfShape) -ConstantOfShape::ConstantOfShape() { - one_blob_only = true; - support_inplace = false; -} +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(ConstantOfShape) + DEFINE_NCNN_OPS(ConstantOfShape, ConstantOfShape) -int ConstantOfShape::load_param(const ParamDict& pd) { - val = pd.get(0, 0.f); - return 0; -} + ConstantOfShape::ConstantOfShape() + { + one_blob_only = true; + support_inplace = false; + } -int ConstantOfShape::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int dims = bottom_blob.w - 1; - const float* bottom_ptr = bottom_blob; - const float* shape_ptr = bottom_ptr + 1; + int ConstantOfShape::load_param(const ParamDict& pd) + { + val = pd.get(0, 0.f); + return 0; + } - if (dims == 1) { - int w = (int)(shape_ptr[0] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } else if (dims == 2) { - int h = (int)(shape_ptr[0] + 0.5); - int w = (int)(shape_ptr[1] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } else if (dims == 3) { - int channels = (int)(shape_ptr[0] + 0.5); - int h = (int)(shape_ptr[1] + 0.5); - int w = (int)(shape_ptr[2] + 0.5); - size_t elemsize = sizeof(val); - top_blob.create(w, h, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - top_blob.fill(val); - return 0; - } - return -1; -} + int ConstantOfShape::forward(const Mat& bottom_blob, + Mat& top_blob, + const Option& opt) const + { + int dims = bottom_blob.w - 1; + const float* bottom_ptr = bottom_blob; + const float* shape_ptr = bottom_ptr + 1; + + if (dims == 1) + { + int w = (int)(shape_ptr[0] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + else if (dims == 2) + { + int h = (int)(shape_ptr[0] + 0.5); + int w = (int)(shape_ptr[1] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + else if (dims == 3) + { + int channels = (int)(shape_ptr[0] + 0.5); + int h = (int)(shape_ptr[1] + 0.5); + int w = (int)(shape_ptr[2] + 0.5); + size_t elemsize = sizeof(val); + top_blob.create(w, h, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + top_blob.fill(val); + return 0; + } + return -1; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h old mode 100755 new mode 100644 index b61fb62c09..85317ba559 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/constantofshape/constantofshape.h @@ -4,20 +4,23 @@ #include "layer.h" 
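A note on the ConstantOfShape::forward just above: the converter passes shape tensors as 1-D float blobs whose first element is the rank, so the layer reads dims = bottom_blob.w - 1, skips that leading element, and rounds each float-encoded extent with (int)(x + 0.5). DEFINE_NCNN_OPS registers the creator with mmdeploy's own op registry; for a standalone ncnn::Net the layer would be registered by hand, roughly as below (file names are hypothetical):

    #include "net.h"

    namespace mmdeploy
    {
        // generated by DEFINE_LAYER_CREATOR(ConstantOfShape)
        ncnn::Layer* ConstantOfShape_layer_creator(void*);
    }  // namespace mmdeploy

    int load_converted(const char* param_path, const char* bin_path)
    {
        ncnn::Net net;
        net.register_custom_layer("ConstantOfShape", mmdeploy::ConstantOfShape_layer_creator);
        if (net.load_param(param_path) != 0 || net.load_model(bin_path) != 0)
            return -1;
        return 0;
    }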
-namespace mmdeploy { +namespace mmdeploy +{ -class ConstantOfShape : public ncnn::Layer { - public: - ConstantOfShape(); + class ConstantOfShape : public ncnn::Layer + { + public: + ConstantOfShape(); - virtual int load_param(const ncnn::ParamDict& pd); + virtual int load_param(const ncnn::ParamDict& pd); - virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, - const ncnn::Option& opt) const; + virtual int forward(const ncnn::Mat& bottom_blob, + ncnn::Mat& top_blob, + const ncnn::Option& opt) const; - public: - float val; -}; + public: + float val; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp old mode 100755 new mode 100644 index be3d75a248..ca8120f228 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.cpp @@ -4,330 +4,454 @@ #include "expand.h" #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(Expand) -DEFINE_NCNN_OPS(Expand, Expand) -Expand::Expand() { - one_blob_only = false; - support_inplace = false; -} - -int Expand::forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const Option& opt) const { - const Mat& bottom_blob = bottom_blobs[0]; - size_t elemsize = bottom_blob.elemsize; - const Mat& old_shape_blob = bottom_blobs[1]; - const int shape_width = old_shape_blob.w - 1; - Mat shape_blob(shape_width, elemsize, opt.workspace_allocator); - memcpy(shape_blob.row(0), old_shape_blob.row(0) + 1, shape_width * elemsize); - Mat& top_blob = top_blobs[0]; - - if (bottom_blob.dims == 1 && shape_blob.w == 1) { - int shape_0 = (int)(shape_blob[0] + 0.5); - if (bottom_blob.w != shape_0 && bottom_blob.w != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d) vs (%d)\n", bottom_blob.w, shape_0); - } else if (bottom_blob.w == shape_0 || shape_0 == 1) { - top_blob.create(bottom_blob.w, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - - for (int i = 0; i < bottom_blob.w; i++) { - top_blob[i] = bottom_blob[i]; - } - } else if (bottom_blob.w == 1) { - top_blob.create(shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - - for (int i = 0; i < shape_0; i++) { - top_blob[i] = bottom_blob[0]; - } - } else { - fprintf(stderr, "error case\n"); - return -100; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(Expand) + DEFINE_NCNN_OPS(Expand, Expand) + Expand::Expand() + { + one_blob_only = false; + support_inplace = false; } - return 0; - } else if (bottom_blob.dims == 1 && shape_blob.w == 2) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (1, %d) vs (%d, %d)\n", bottom_blob.w, shape_0, - shape_1); - } else if (bottom_blob.w == shape_1 || shape_1 == 1) { - top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob[i]; - } - } + int Expand::forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + size_t elemsize = bottom_blob.elemsize; + const Mat& old_shape_blob = bottom_blobs[1]; + const int shape_width = old_shape_blob.w - 1; + Mat shape_blob(shape_width, elemsize, 
opt.workspace_allocator); + memcpy(shape_blob.row(0), old_shape_blob.row(0) + 1, shape_width * elemsize); + Mat& top_blob = top_blobs[0]; - } else if (bottom_blob.w == 1) { - top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; + if (bottom_blob.dims == 1 && shape_blob.w == 1) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + if (bottom_blob.w != shape_0 && bottom_blob.w != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d) vs (%d)\n", bottom_blob.w, shape_0); + } + else if (bottom_blob.w == shape_0 || shape_0 == 1) + { + top_blob.create(bottom_blob.w, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob[0]; - } - } + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob[i] = bottom_blob[i]; + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 1 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (1, 1, %d) vs (%d, %d, %d)\n", bottom_blob.w, - shape_0, shape_1, shape_2); - } else if (bottom_blob.w == shape_2 || shape_2 == 1) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob[i]; - } - } - } - } else if (bottom_blob.w == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob[0]; - } - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 2 && shape_blob.w == 2) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1); - } else if (bottom_blob.h != shape_0 && bottom_blob.h != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1); - } else if ((bottom_blob.w == shape_1 || shape_1 == 1) && - (bottom_blob.h == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob.row(j)[i]; - } - } - } else if ((bottom_blob.w == shape_1 || shape_1 == 1) && (bottom_blob.h == 1)) { - top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.row(j)[i] = bottom_blob.row(0)[i]; - } - } - } else if ((bottom_blob.w == 1) && 
(bottom_blob.h == shape_0 || shape_0 == 1)) { - top_blob.create(shape_1, bottom_blob.h, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob.row(j)[0]; + for (int i = 0; i < shape_0; i++) + { + top_blob[i] = bottom_blob[0]; + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.h == 1 && bottom_blob.w == 1) { - top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int j = 0; j < shape_0; j++) { - for (int i = 0; i < shape_1; i++) { - top_blob.row(j)[i] = bottom_blob.row(0)[0]; - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; - } - return 0; - } else if (bottom_blob.dims == 2 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, - bottom_blob.w, shape_0, shape_1, shape_2); - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && - (bottom_blob.h == shape_1 || shape_1 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[i]; - } - } - } - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1)) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[i]; - } - } - } - - } else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_1 || shape_1 == 1)) { - top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[0]; - } - } - } + else if (bottom_blob.dims == 1 && shape_blob.w == 2) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (1, %d) vs (%d, %d)\n", bottom_blob.w, shape_0, shape_1); + } + else if (bottom_blob.w == shape_1 || shape_1 == 1) + { + top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else if (bottom_blob.h == 1 && bottom_blob.w == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[0]; - } - } - } - } else { - fprintf(stderr, "error case\n"); - return -100; 
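// [Editorial aside -- illustrative sketch, not part of the original diff.]
// Every branch of Expand::forward repeats the same ONNX-style broadcast
// check: two extents are compatible exactly when they are equal or one of
// them is 1. A minimal standalone rendering of that rule (the helper name
// and the main() test are assumptions for illustration, not mmdeploy API):
//
//     #include <cassert>
//
//     // True iff extents `a` and `b` can be broadcast together: the output
//     // extent is then max(a, b) and the size-1 side is repeated.
//     static inline bool broadcast_compatible(int a, int b)
//     {
//         return a == b || a == 1 || b == 1;
//     }
//
//     int main()
//     {
//         assert(broadcast_compatible(4, 4));   // equal extents copy through
//         assert(broadcast_compatible(1, 7));   // the size-1 side is tiled
//         assert(!broadcast_compatible(3, 7));  // the "error case" branches
//         return 0;
//     }
//
// The `(int)(shape_blob[i] + 0.5)` conversions exist because ncnn carries
// the target shape as floats; for the non-negative extents involved, adding
// 0.5 before truncation rounds to the nearest integer.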
- } - return 0; - } else if (bottom_blob.dims == 3 && shape_blob.w == 3) { - int shape_0 = (int)(shape_blob[0] + 0.5); - int shape_1 = (int)(shape_blob[1] + 0.5); - int shape_2 = (int)(shape_blob[2] + 0.5); - if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if (bottom_blob.c != shape_0 && bottom_blob.c != 1 && shape_0 != 1) { - fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, - bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && - (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[i]; - } - } - } - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && - (bottom_blob.h == shape_1 || shape_1 == 1) && (bottom_blob.c == 1)) { - top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[i]; - } - } - } - - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(bottom_blob.w, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[i]; - } - } - } + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob[i]; + } + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; - } else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && - (bottom_blob.c == 1)) { - top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < bottom_blob.w; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[i]; - } + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob[0]; + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } + else if (bottom_blob.dims == 1 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); - } else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == shape_0 || shape_0 == 
1)) { - top_blob.create(shape_2, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[0]; - } + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (1, 1, %d) vs (%d, %d, %d)\n", bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.w == shape_2 || shape_2 == 1) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob[i]; + } + } + } + } + else if (bottom_blob.w == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && - (bottom_blob.c == 1)) { - top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < bottom_blob.h; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[0]; - } + else if (bottom_blob.dims == 2 && shape_blob.w == 2) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + if (bottom_blob.w != shape_1 && bottom_blob.w != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1); + } + else if (bottom_blob.h != shape_0 && bottom_blob.h != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1); + } + else if ((bottom_blob.w == shape_1 || shape_1 == 1) && + (bottom_blob.h == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob.row(j)[i]; + } + } + } + else if ((bottom_blob.w == shape_1 || shape_1 == 1) && (bottom_blob.h == 1)) + { + top_blob.create(bottom_blob.w, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.row(j)[i] = bottom_blob.row(0)[i]; + } + } + } + else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_1, bottom_blob.h, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob.row(j)[0]; + } + } + } + else if (bottom_blob.h == 1 && bottom_blob.w == 1) + { + top_blob.create(shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int j = 0; j < shape_0; j++) + { + for (int i = 0; i < 
shape_1; i++) + { + top_blob.row(j)[i] = bottom_blob.row(0)[0]; + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && bottom_blob.h == 1 && - (bottom_blob.c == shape_0 || shape_0 == 1)) { - top_blob.create(shape_2, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < bottom_blob.c; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[0]; - } + else if (bottom_blob.dims == 2 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d) vs (%d, %d, %d)\n", bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1)) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[i]; + } + } + } + } + else if ((bottom_blob.w == 1) && (bottom_blob.h == shape_1 || shape_1 == 1)) + { + top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(j)[0]; + } + } + } + } + else if (bottom_blob.h == 1 && bottom_blob.w == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.row(0)[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else if (bottom_blob.w == 1 && bottom_blob.h == 1 && bottom_blob.c == 1) { - top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); - if (top_blob.empty()) return -100; - for (int k = 0; k < shape_0; k++) { - for (int j = 0; j < shape_1; j++) { - for (int i = 0; i < shape_2; i++) { - top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[0]; - } + else if (bottom_blob.dims == 3 && shape_blob.w == 3) + { + int shape_0 = (int)(shape_blob[0] + 0.5); + int shape_1 = (int)(shape_blob[1] + 0.5); + int shape_2 = (int)(shape_blob[2] + 0.5); + if (bottom_blob.w != shape_2 && bottom_blob.w != 1 && shape_2 != 1) + { + fprintf(stderr, "The 
broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.h != shape_1 && bottom_blob.h != 1 && shape_1 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if (bottom_blob.c != shape_0 && bottom_blob.c != 1 && shape_0 != 1) + { + fprintf(stderr, "The broadcast rule is wrong, (%d, %d, %d) vs (%d, %d, %d)\n", bottom_blob.c, bottom_blob.h, bottom_blob.w, shape_0, shape_1, shape_2); + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && + (bottom_blob.h == shape_1 || shape_1 == 1) && (bottom_blob.c == 1)) + { + top_blob.create(bottom_blob.w, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(bottom_blob.w, shape_1, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[i]; + } + } + } + } + else if ((bottom_blob.w == shape_2 || shape_2 == 1) && (bottom_blob.h == 1) && + (bottom_blob.c == 1)) + { + top_blob.create(bottom_blob.w, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < bottom_blob.w; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[i]; + } + } + } + } + else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_2, bottom_blob.h, bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(j)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && (bottom_blob.h == shape_1 || shape_1 == 1) && + (bottom_blob.c == 1)) + { + top_blob.create(shape_2, bottom_blob.h, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < bottom_blob.h; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(j)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && bottom_blob.h == 1 && + (bottom_blob.c == shape_0 || shape_0 == 1)) + { + top_blob.create(shape_2, shape_1, 
bottom_blob.c, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < bottom_blob.c; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(k).row(0)[0]; + } + } + } + } + else if (bottom_blob.w == 1 && bottom_blob.h == 1 && bottom_blob.c == 1) + { + top_blob.create(shape_2, shape_1, shape_0, elemsize, opt.blob_allocator); + if (top_blob.empty()) return -100; + for (int k = 0; k < shape_0; k++) + { + for (int j = 0; j < shape_1; j++) + { + for (int i = 0; i < shape_2; i++) + { + top_blob.channel(k).row(j)[i] = bottom_blob.channel(0).row(0)[0]; + } + } + } + } + else + { + fprintf(stderr, "error case\n"); + return -100; + } + return 0; } - } - } else { - fprintf(stderr, "error case\n"); - return -100; + fprintf(stderr, "Layer: Expand, bottom_blob.dims: %d, shape_blob.w: %d\n", bottom_blob.dims, shape_blob.w); + return -1; } - return 0; - } - fprintf(stderr, "Layer: Expand, bottom_blob.dims: %d, shape_blob.w: %d\n", bottom_blob.dims, - shape_blob.w); - return -1; -} } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h old mode 100755 new mode 100644 index 3dca54fb0f..5b280100a4 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/expand/expand.h @@ -4,15 +4,18 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class Expand : public ncnn::Layer { - public: - Expand(); + class Expand : public ncnn::Layer + { + public: + Expand(); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const ncnn::Option& opt) const; -}; + virtual int forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const ncnn::Option& opt) const; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp index 4b6bd34630..15950bdbfa 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.cpp @@ -4,157 +4,183 @@ #include "../ncnn_ops_definer.h" #include "assert.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(Gather) -DEFINE_NCNN_OPS(Gather, Gather) -Gather::Gather() { - one_blob_only = false; - support_inplace = false; -} - -int Gather::load_param(const ParamDict &pd) { - axis = pd.get(0, 0); - - return 0; -} - -// Gather only support 1-dim of indices, because the data and indices all has -// implicit batch in ncnn, this will lead to wrong shape to match onnx result. -// When indices dim equals to 1, after eliminating implicit batch, the indices -// dim still be 1. So there is only 1 implicit batch in data, this will make -// the shape match onnx result. -int Gather::forward(const std::vector &bottom_blobs, std::vector &top_blobs, - const Option &opt) const { - const Mat &bottom_blob = bottom_blobs[0]; - const Mat &indices = bottom_blobs[1]; - int dims = bottom_blob.dims; - int indices_dims = indices.dims; - size_t elemsize = bottom_blob.elemsize; - int positive_axis = axis < 0 ? 
dims + axis : axis; - Mat &top_blob = top_blobs[0]; - assert(indices.dims == 1); - const float *indices_ptr = indices; - - if (dims == 1 && indices_dims == 1) // positive_axis == 0 - { - int w = indices.w; - top_blob.create(w, elemsize, opt.blob_allocator); - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int i = 0; i < w; i++) { - float indice = indices_ptr[i]; - outptr[i] = ptr[(int)(indice + 0.5)]; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(Gather) + DEFINE_NCNN_OPS(Gather, Gather) + + Gather::Gather() + { + one_blob_only = false; + support_inplace = false; } - return 0; - } - - if (dims == 2 && positive_axis == 0 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(w, indices.w, elemsize, opt.blob_allocator); - // w -> w - // h -> indices.w - // h * w -> indices.w * w - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int i = 0; i < indices.w; i++) { - const int selected = (int)(indices_ptr[i] + 0.5); - memcpy(top_blob.row(i), bottom_blob.row(selected), w * elemsize); - } + int Gather::load_param(const ParamDict& pd) + { + axis = pd.get(0, 0); - return 0; - } - - if (dims == 2 && positive_axis == 1 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - top_blob.create(indices.w, h, elemsize, opt.blob_allocator); - // w -> h - // h -> indices.w - // h * w -> indices.w * h - if (top_blob.empty()) { - return -100; - } - const float *ptr = bottom_blob; - float *outptr = top_blob; - for (int j = 0; j < h; j++) { - for (int i = 0; i < indices.w; i++) { - int selected = (int)(indices_ptr[i] + 0.5); - outptr[j * indices.w + i] = ptr[j * w + selected]; - } + return 0; } - return 0; - } - if (dims == 3 && positive_axis == 0 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(w, h, indices.w, elemsize, opt.blob_allocator); + // Gather only support 1-dim of indices, because the data and indices all has + // implicit batch in ncnn, this will lead to wrong shape to match onnx result. + // When indices dim equals to 1, after eliminating implicit batch, the indices + // dim still be 1. So there is only 1 implicit batch in data, this will make + // the shape match onnx result. + int Gather::forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const Option& opt) const + { + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& indices = bottom_blobs[1]; + int dims = bottom_blob.dims; + int indices_dims = indices.dims; + size_t elemsize = bottom_blob.elemsize; + int positive_axis = axis < 0 ? 
dims + axis : axis; + Mat& top_blob = top_blobs[0]; + assert(indices.dims == 1); + const float* indices_ptr = indices; + + if (dims == 1 && indices_dims == 1) // positive_axis == 0 + { + int w = indices.w; + top_blob.create(w, elemsize, opt.blob_allocator); + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int i = 0; i < w; i++) + { + float indice = indices_ptr[i]; + outptr[i] = ptr[(int)(indice + 0.5)]; + } + + return 0; + } - if (top_blob.empty()) { - return -100; - } - for (int i = 0; i < indices.w; i++) { - int selected = (int)(indices_ptr[i] + 0.5); - const unsigned char *ptr = bottom_blob.channel(selected); - unsigned char *outptr = top_blob.channel(i); + if (dims == 2 && positive_axis == 0 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + top_blob.create(w, indices.w, elemsize, opt.blob_allocator); + // w -> w + // h -> indices.w + // h * w -> indices.w * w + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int i = 0; i < indices.w; i++) + { + const int selected = (int)(indices_ptr[i] + 0.5); + memcpy(top_blob.row(i), bottom_blob.row(selected), w * elemsize); + } + + return 0; + } - memcpy(outptr, ptr, w * h * elemsize); - } - return 0; - } - - if (dims == 3 && positive_axis == 1 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(w, indices.w, channels, elemsize, opt.blob_allocator); -#pragma omp parallel for num_threads(opt.num_threads) - // use parallel programming - for (int i = 0; i < channels; i++) { - float *outptr = top_blob.channel(i); - const float *ptr = bottom_blob.channel(i); - for (int j = 0; j < indices.w; j++) { - int selected = (int)(indices_ptr[j] + 0.5); - for (int k = 0; k < w; k++) { - outptr[j * w + k] = ptr[selected * w + k]; + if (dims == 2 && positive_axis == 1 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + top_blob.create(indices.w, h, elemsize, opt.blob_allocator); + // w -> h + // h -> indices.w + // h * w -> indices.w * h + if (top_blob.empty()) + { + return -100; + } + const float* ptr = bottom_blob; + float* outptr = top_blob; + for (int j = 0; j < h; j++) + { + for (int i = 0; i < indices.w; i++) + { + int selected = (int)(indices_ptr[i] + 0.5); + outptr[j * indices.w + i] = ptr[j * w + selected]; + } + } + return 0; } - } - } - return 0; - } + if (dims == 3 && positive_axis == 0 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(w, h, indices.w, elemsize, opt.blob_allocator); + + if (top_blob.empty()) + { + return -100; + } + for (int i = 0; i < indices.w; i++) + { + int selected = (int)(indices_ptr[i] + 0.5); + const unsigned char* ptr = bottom_blob.channel(selected); + unsigned char* outptr = top_blob.channel(i); + + memcpy(outptr, ptr, w * h * elemsize); + } + return 0; + } - if (dims == 3 && positive_axis == 2 && indices_dims == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - top_blob.create(indices.w, h, channels, elemsize, opt.blob_allocator); + if (dims == 3 && positive_axis == 1 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(w, indices.w, channels, elemsize, opt.blob_allocator); #pragma omp parallel for num_threads(opt.num_threads) - // use parallel programming - for (int i = 0; i < channels; 
i++) { - float *outptr = top_blob.channel(i); - const float *ptr = bottom_blob.channel(i); - for (int j = 0; j < h; j++) { - for (int k = 0; k < indices.w; k++) { - int selected = (int)(indices_ptr[k] + 0.5); - outptr[j * indices.w + k] = ptr[j * w + selected]; + // use parallel programming + for (int i = 0; i < channels; i++) + { + float* outptr = top_blob.channel(i); + const float* ptr = bottom_blob.channel(i); + for (int j = 0; j < indices.w; j++) + { + int selected = (int)(indices_ptr[j] + 0.5); + for (int k = 0; k < w; k++) + { + outptr[j * w + k] = ptr[selected * w + k]; + } + } + } + + return 0; } - } - } - return 0; - } - return 0; -} + if (dims == 3 && positive_axis == 2 && indices_dims == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + top_blob.create(indices.w, h, channels, elemsize, opt.blob_allocator); +#pragma omp parallel for num_threads(opt.num_threads) + // use parallel programming + for (int i = 0; i < channels; i++) + { + float* outptr = top_blob.channel(i); + const float* ptr = bottom_blob.channel(i); + for (int j = 0; j < h; j++) + { + for (int k = 0; k < indices.w; k++) + { + int selected = (int)(indices_ptr[k] + 0.5); + outptr[j * indices.w + k] = ptr[j * w + selected]; + } + } + } + return 0; + } + + return 0; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h old mode 100755 new mode 100644 index af6eb6365e..e7bfb717c8 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/gather/gather.h @@ -4,20 +4,23 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class Gather : public ncnn::Layer { - public: - Gather(); + class Gather : public ncnn::Layer + { + public: + Gather(); - virtual int load_param(const ncnn::ParamDict& pd); + virtual int load_param(const ncnn::ParamDict& pd); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const ncnn::Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const ncnn::Option& opt) const; - public: - int axis; -}; + public: + int axis; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h old mode 100755 new mode 100644 index 509c8c0ce0..bd5d9ca23e --- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_definer.h @@ -7,22 +7,24 @@ #include "layer.h" #include "ncnn_ops_register.h" -namespace mmdeploy { - -class NCNNOpsDefiner { - public: - NCNNOpsDefiner(const std::string& ops_name, const ncnn::layer_creator_func& creator_func = 0, - const ncnn::layer_destroyer_func& destroyer_func = 0) - : _ops_name(ops_name) { - get_mmdeploy_layer_creator()[_ops_name.c_str()] = creator_func; - } - - private: - const std::string _ops_name; -}; +namespace mmdeploy +{ + + class NCNNOpsDefiner + { + public: + NCNNOpsDefiner(const std::string& ops_name, const ncnn::layer_creator_func& creator_func = 0, const ncnn::layer_destroyer_func& destroyer_func = 0) + : _ops_name(ops_name) + { + get_mmdeploy_layer_creator()[_ops_name.c_str()] = creator_func; + } + + private: + const std::string _ops_name; + }; #define DEFINE_NCNN_OPS(ops_name, OpsLayer) \ - static mmdeploy::NCNNOpsDefiner NCNNOpsDefiner##ops_name{#ops_name, OpsLayer##_layer_creator}; + static mmdeploy::NCNNOpsDefiner NCNNOpsDefiner##ops_name{#ops_name, 
OpsLayer##_layer_creator};
 
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
old mode 100755
new mode 100644
index 42bc050a1c..85d4f66d04
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.cpp
@@ -3,32 +3,38 @@
 #include <map>
 
-std::map<const char*, ncnn::layer_creator_func> &get_mmdeploy_layer_creator() {
-  static std::map<const char*, ncnn::layer_creator_func> _layer_creator_map;
-  return _layer_creator_map;
+std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator()
+{
+    static std::map<const char*, ncnn::layer_creator_func> _layer_creator_map;
+    return _layer_creator_map;
 }
 
-std::map<const char*, ncnn::layer_destroyer_func> &get_mmdeploy_layer_destroyer() {
-  static std::map<const char*, ncnn::layer_destroyer_func> _layer_destroyer_map;
-  return _layer_destroyer_map;
+std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer()
+{
+    static std::map<const char*, ncnn::layer_destroyer_func> _layer_destroyer_map;
+    return _layer_destroyer_map;
 }
 
-int register_mmdeploy_custom_layers(ncnn::Net &net) {
-  auto &layer_creator_map = get_mmdeploy_layer_creator();
-  auto &layer_destroyer_map = get_mmdeploy_layer_destroyer();
+int register_mmdeploy_custom_layers(ncnn::Net& net)
+{
+    auto& layer_creator_map = get_mmdeploy_layer_creator();
+    auto& layer_destroyer_map = get_mmdeploy_layer_destroyer();
 
-  for (auto const &creator_pair : layer_creator_map) {
-    auto creator_name = creator_pair.first;
-    auto creator_func = creator_pair.second;
+    for (auto const& creator_pair : layer_creator_map)
+    {
+        auto creator_name = creator_pair.first;
+        auto creator_func = creator_pair.second;
 
-    ncnn::layer_destroyer_func destroyer_func = 0;
-    if (layer_destroyer_map.find(creator_name) != layer_destroyer_map.end()) {
-      destroyer_func = layer_destroyer_map[creator_name];
+        ncnn::layer_destroyer_func destroyer_func = 0;
+        if (layer_destroyer_map.find(creator_name) != layer_destroyer_map.end())
+        {
+            destroyer_func = layer_destroyer_map[creator_name];
+        }
+        int ret = net.register_custom_layer(creator_name, creator_func, destroyer_func);
+        if (0 != ret)
+        {
+            return ret;
+        }
     }
-    int ret = net.register_custom_layer(creator_name, creator_func, destroyer_func);
-    if (0 != ret) {
-      return ret;
-    }
-  }
-  return 0;
+    return 0;
 }
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
old mode 100755
new mode 100644
index 0d9974f783..32c918156c
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/ncnn_ops_register.h
@@ -9,8 +9,9 @@
 #include "net.h"
 
 MMDEPLOY_API std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator();
+
 MMDEPLOY_API std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer();
 
-MMDEPLOY_API int register_mmdeploy_custom_layers(ncnn::Net& net);
+MMDEPLOY_API int                                                register_mmdeploy_custom_layers(ncnn::Net& net);
 
 #endif
diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp
old mode 100755
new mode 100644
index f538eabbac..cce2935ba1
--- a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp
+++ b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.cpp
@@ -3,45 +3,59 @@
 #include "../ncnn_ops_definer.h"
 
-namespace mmdeploy {
-using namespace ncnn;
-DEFINE_LAYER_CREATOR(Shape)
-DEFINE_NCNN_OPS(Shape, Shape)
-Shape::Shape() {
-  one_blob_only = true;
-  support_inplace = false;
-}
+namespace mmdeploy
+{
+    using namespace ncnn;
+    DEFINE_LAYER_CREATOR(Shape)
+    DEFINE_NCNN_OPS(Shape, Shape)
 
-int Shape::forward(const Mat &bottom_blob, Mat &top_blob, const Option &opt) const {
-  int dims = bottom_blob.dims;
-  int w = bottom_blob.w;
-  size_t elemsize = sizeof(float);
-  top_blob.create(dims + 1, elemsize,
opt.blob_allocator); - if (top_blob.empty()) { - return -100; - } - float *outptr = top_blob; + Shape::Shape() + { + one_blob_only = true; + support_inplace = false; + } - if (dims == 1) { - outptr[0] = 1.0f; - outptr[1] = w; - } else if (dims == 2) { - int h = bottom_blob.h; - outptr[0] = 1.0f; - outptr[1] = h; - outptr[2] = w; - } else if (dims == 3) { - int h = bottom_blob.h; - int channels = bottom_blob.c; - outptr[0] = 1.0f; - outptr[1] = channels; - outptr[2] = h; - outptr[3] = w; - } else { - fprintf(stdout, "Unsupported dims=%d\n", dims); - } + int Shape::forward(const Mat& bottom_blob, + Mat& top_blob, + const Option& opt) const + { + int dims = bottom_blob.dims; + int w = bottom_blob.w; + size_t elemsize = sizeof(float); + top_blob.create(dims + 1, elemsize, opt.blob_allocator); + if (top_blob.empty()) + { + return -100; + } + float* outptr = top_blob; - return 0; -} + if (dims == 1) + { + outptr[0] = 1.0f; + outptr[1] = w; + } + else if (dims == 2) + { + int h = bottom_blob.h; + outptr[0] = 1.0f; + outptr[1] = h; + outptr[2] = w; + } + else if (dims == 3) + { + int h = bottom_blob.h; + int channels = bottom_blob.c; + outptr[0] = 1.0f; + outptr[1] = channels; + outptr[2] = h; + outptr[3] = w; + } + else + { + fprintf(stdout, "Unsupported dims=%d\n", dims); + } + + return 0; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h old mode 100755 new mode 100644 index 863dc77c1d..2c1e4573bf --- a/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/shape/shape.h @@ -4,15 +4,18 @@ #include "layer.h" -namespace mmdeploy { +namespace mmdeploy +{ -class Shape : public ncnn::Layer { - public: - Shape(); + class Shape : public ncnn::Layer + { + public: + Shape(); - virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, - const ncnn::Option& opt) const; -}; + virtual int forward(const ncnn::Mat& bottom_blob, + ncnn::Mat& top_blob, + const ncnn::Option& opt) const; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp index 9f2ced1992..8b1e35ae66 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.cpp @@ -5,202 +5,253 @@ #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(TensorSlice) -DEFINE_NCNN_OPS(TensorSlice, TensorSlice) -TensorSlice::TensorSlice() { - one_blob_only = true; - support_inplace = false; -} +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(TensorSlice) + DEFINE_NCNN_OPS(TensorSlice, TensorSlice) -int TensorSlice::load_param(const ParamDict& pd) { - starts = pd.get(0, Mat()); - ends = pd.get(1, Mat()); - axes = pd.get(2, Mat()); - steps = pd.get(3, Mat()); - if (axes.w == 0) { - axes.create(starts.w); - int* axes_ptr = axes; - for (int i = 0; i < starts.w; i++) { - axes_ptr[i] = i; + TensorSlice::TensorSlice() + { + one_blob_only = true; + support_inplace = false; } - } - if (steps.w == 0) { - steps.create(axes.w); - steps.fill(1); - } - return 0; -} -static inline int get_shape_by_axes(const Mat& blob, int axes, int dims) { - switch (dims - axes) { - case 0: - return blob.w; - case 1: - return blob.h; - case 2: - return blob.c; - default: - fprintf(stderr, "wrong axes %d!\n", axes); - return -1; - } - return 0; -} - -int TensorSlice::forward(const Mat& bottom_blob, 
Mat& top_blob, const Option& opt) const {
-  int dims = bottom_blob.dims;
-  size_t elemsize = bottom_blob.elemsize;
-  const int* start_ptr = starts;
-  const int* end_ptr = ends;
-  const int* axes_ptr = axes;
-  const int* step_ptr = steps;
-  if (starts.w > dims || ends.w > dims) {
-    fprintf(stderr, "start/end attributes shape error!\n");
-    return -100;
-  }
-  if (axes.w != 1) {
-    fprintf(stderr,
-            "axes.w must be 1 because any of multiaxes slice is regarded as "
-            "multi-staged onnx slice in pytorch2onnx.");
-  }
-  if (dims == 1) {
-    for (int i = 0; i < axes.w; i++) {
-      int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
-      int step = step_ptr[i];
-      std::vector<float> temp_val;
-      int start = start_ptr[i];
-      int end = end_ptr[i];
-      int cur = start;
-      if (step > 0) {
-        while (cur < end && cur < bottom_blob.w) {
-          temp_val.push_back(bottom_blob[cur]);
-          cur += step;
+    int TensorSlice::load_param(const ParamDict& pd)
+    {
+        starts = pd.get(0, Mat());
+        ends = pd.get(1, Mat());
+        axes = pd.get(2, Mat());
+        steps = pd.get(3, Mat());
+        if (axes.w == 0)
+        {
+            axes.create(starts.w);
+            int* axes_ptr = axes;
+            for (int i = 0; i < starts.w; i++)
+            {
+                axes_ptr[i] = i;
+            }
         }
-      } else if (step < 0) {
-        while (cur > end && cur > 0) {
-          temp_val.push_back(bottom_blob[cur]);
-          cur += step;
+        if (steps.w == 0)
+        {
+            steps.create(axes.w);
+            steps.fill(1);
         }
-      } else {
-        fprintf(stderr, "step should not be 0!\n");
-        return -100;
-      }
-      top_blob.create(temp_val.size(), elemsize, opt.blob_allocator);
-      for (int i = 0; i < temp_val.size(); i++) {
-        top_blob[i] = temp_val[i];
-      }
-    }
-    return 0;
-  }
-  if (dims == 2) {
-    std::vector<std::vector<int> > active_indice;
-    std::vector<int> indices;
-    for (int i = 0; i < bottom_blob.h; i++) {
-      indices.push_back(i);
-    }
-    active_indice.push_back(indices);
-    indices.clear();
-    for (int i = 0; i < bottom_blob.w; i++) {
-      indices.push_back(i);
+        return 0;
     }
-    active_indice.push_back(indices);
-    for (int i = 0; i < axes.w; i++) {
-      int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
-      int step = step_ptr[i];
-      int start = start_ptr[i];
-      int end = end_ptr[i];
-      int dim_shape = get_shape_by_axes(bottom_blob, positive_axis, dims);
-      int dim_shape_test = get_shape_by_axes(bottom_blob, positive_axis, dims - 1);
-      if (dim_shape < 0) {
-        return -1;
-      }
-      end = end < dim_shape ?
end : dim_shape; - int cur = start; - std::vector temp_indice; - if (step > 0) { - while (cur < end && cur < dim_shape) { - temp_indice.push_back(cur); - cur += step; - } - } else if (step < 0) { - while (cur > end && cur > 0) { - temp_indice.push_back(cur); - cur += step; - } - } else { - fprintf(stderr, "step should not be 0!\n"); - return -100; - } - active_indice[positive_axis - 1] = temp_indice; - active_indice[positive_axis - 1].resize(temp_indice.size()); - } - top_blob.create((int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, - opt.blob_allocator); - for (int i = 0; i < active_indice[0].size(); i++) { - for (int j = 0; j < active_indice[1].size(); j++) { - top_blob.row(i)[j] = bottom_blob.row(active_indice[0][i])[active_indice[1][j]]; - } - } - return 0; - } - if (dims == 3) { - std::vector > active_indice; - std::vector indices; - for (int i = 0; i < bottom_blob.c; i++) { - indices.push_back(i); - } - active_indice.push_back(indices); - indices.clear(); - for (int i = 0; i < bottom_blob.h; i++) { - indices.push_back(i); - } - active_indice.push_back(indices); - indices.clear(); - for (int i = 0; i < bottom_blob.w; i++) { - indices.push_back(i); + static inline int get_shape_by_axes(const Mat& blob, int axes, int dims) + { + switch (dims - axes) + { + case 0: + return blob.w; + case 1: + return blob.h; + case 2: + return blob.c; + default: + fprintf(stderr, "wrong axes %d!\n", axes); + return -1; + } + return 0; } - active_indice.push_back(indices); - for (int i = 0; i < axes.w; i++) { - int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i]; - int step = step_ptr[i]; - int start = start_ptr[i]; - int end = end_ptr[i]; - int cur = start; - std::vector temp_indice; - if (step > 0) { - while (cur < end && cur < bottom_blob.w) { - temp_indice.push_back(cur); - cur += step; + int TensorSlice::forward(const Mat& bottom_blob, + Mat& top_blob, + const Option& opt) const + { + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + const int* start_ptr = starts; + const int* end_ptr = ends; + const int* axes_ptr = axes; + const int* step_ptr = steps; + if (starts.w > dims || ends.w > dims) + { + fprintf(stderr, "start/end attributes shape error!\n"); + return -100; } - } else if (step < 0) { - while (cur > end && cur > 0) { - temp_indice.push_back(cur); - cur += step; + if (axes.w != 1) + { + fprintf(stderr, + "axes.w must be 1 because any of multiaxes slice is regarded as " + "multi-staged onnx slice in pytorch2onnx."); } - } else { - fprintf(stderr, "step should not be 0!\n"); - return -100; - } - active_indice[positive_axis - 1] = temp_indice; - active_indice[positive_axis - 1].resize(temp_indice.size()); - } - top_blob.create((int)active_indice[2].size(), (int)active_indice[1].size(), - (int)active_indice[0].size(), elemsize, opt.blob_allocator); - for (int i = 0; i < active_indice[0].size(); i++) { - for (int j = 0; j < active_indice[1].size(); j++) { - for (int k = 0; k < active_indice[2].size(); k++) { - top_blob.channel(i).row(j)[k] = bottom_blob.channel(active_indice[0][i]) - .row(active_indice[1][j])[active_indice[2][k]]; + if (dims == 1) + { + for (int i = 0; i < axes.w; i++) + { + int positive_axis = axes_ptr[i] < 0 ? 
dims + axes_ptr[i] : axes_ptr[i]; + int step = step_ptr[i]; + std::vector temp_val; + int start = start_ptr[i]; + int end = end_ptr[i]; + int cur = start; + if (step > 0) + { + while (cur < end && cur < bottom_blob.w) + { + temp_val.push_back(bottom_blob[cur]); + cur += step; + } + } + else if (step < 0) + { + while (cur > end && cur > 0) + { + temp_val.push_back(bottom_blob[cur]); + cur += step; + } + } + else + { + fprintf(stderr, "step should not be 0!\n"); + return -100; + } + top_blob.create(temp_val.size(), elemsize, opt.blob_allocator); + for (int i = 0; i < temp_val.size(); i++) + { + top_blob[i] = temp_val[i]; + } + } + return 0; + } + if (dims == 2) + { + std::vector> active_indice; + std::vector indices; + for (int i = 0; i < bottom_blob.h; i++) + { + indices.push_back(i); + } + active_indice.push_back(indices); + indices.clear(); + for (int i = 0; i < bottom_blob.w; i++) + { + indices.push_back(i); + } + active_indice.push_back(indices); + for (int i = 0; i < axes.w; i++) + { + int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i]; + int step = step_ptr[i]; + int start = start_ptr[i]; + int end = end_ptr[i]; + int dim_shape = get_shape_by_axes(bottom_blob, positive_axis, dims); + int dim_shape_test = get_shape_by_axes(bottom_blob, positive_axis, dims - 1); + if (dim_shape < 0) + { + return -1; + } + end = end < dim_shape ? end : dim_shape; + int cur = start; + std::vector temp_indice; + if (step > 0) + { + while (cur < end && cur < dim_shape) + { + temp_indice.push_back(cur); + cur += step; + } + } + else if (step < 0) + { + while (cur > end && cur > 0) + { + temp_indice.push_back(cur); + cur += step; + } + } + else + { + fprintf(stderr, "step should not be 0!\n"); + return -100; + } + active_indice[positive_axis - 1] = temp_indice; + active_indice[positive_axis - 1].resize(temp_indice.size()); + } + top_blob.create((int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, opt.blob_allocator); + for (int i = 0; i < active_indice[0].size(); i++) + { + for (int j = 0; j < active_indice[1].size(); j++) + { + top_blob.row(i)[j] = bottom_blob.row(active_indice[0][i])[active_indice[1][j]]; + } + } + return 0; } - } - } - return 0; - } - return 0; -} + if (dims == 3) + { + std::vector> active_indice; + std::vector indices; + for (int i = 0; i < bottom_blob.c; i++) + { + indices.push_back(i); + } + active_indice.push_back(indices); + indices.clear(); + for (int i = 0; i < bottom_blob.h; i++) + { + indices.push_back(i); + } + active_indice.push_back(indices); + indices.clear(); + for (int i = 0; i < bottom_blob.w; i++) + { + indices.push_back(i); + } + active_indice.push_back(indices); + for (int i = 0; i < axes.w; i++) + { + int positive_axis = axes_ptr[i] < 0 ? 
dims + axes_ptr[i] : axes_ptr[i]; + int step = step_ptr[i]; + + int start = start_ptr[i]; + int end = end_ptr[i]; + int cur = start; + std::vector temp_indice; + if (step > 0) + { + while (cur < end && cur < bottom_blob.w) + { + temp_indice.push_back(cur); + cur += step; + } + } + else if (step < 0) + { + while (cur > end && cur > 0) + { + temp_indice.push_back(cur); + cur += step; + } + } + else + { + fprintf(stderr, "step should not be 0!\n"); + return -100; + } + active_indice[positive_axis - 1] = temp_indice; + active_indice[positive_axis - 1].resize(temp_indice.size()); + } + top_blob.create((int)active_indice[2].size(), (int)active_indice[1].size(), (int)active_indice[0].size(), elemsize, opt.blob_allocator); + for (int i = 0; i < active_indice[0].size(); i++) + { + for (int j = 0; j < active_indice[1].size(); j++) + { + for (int k = 0; k < active_indice[2].size(); k++) + { + top_blob.channel(i).row(j)[k] = bottom_blob.channel(active_indice[0][i]) + .row(active_indice[1][j])[active_indice[2][k]]; + } + } + } + return 0; + } + + return 0; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h old mode 100755 new mode 100644 index 9164d43335..fbffdcb843 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/tensorslice/tensorslice.h @@ -4,23 +4,26 @@ #include "layer.h" -namespace mmdeploy { - -class TensorSlice : public ncnn::Layer { - public: - TensorSlice(); - - virtual int load_param(const ncnn::ParamDict& pd); - - virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, - const ncnn::Option& opt) const; - - public: - ncnn::Mat starts; - ncnn::Mat ends; - ncnn::Mat axes; - ncnn::Mat steps; -}; +namespace mmdeploy +{ + + class TensorSlice : public ncnn::Layer + { + public: + TensorSlice(); + + virtual int load_param(const ncnn::ParamDict& pd); + + virtual int forward(const ncnn::Mat& bottom_blob, + ncnn::Mat& top_blob, + const ncnn::Option& opt) const; + + public: + ncnn::Mat starts; + ncnn::Mat ends; + ncnn::Mat axes; + ncnn::Mat steps; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp index f618831568..cfa55d1f8e 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.cpp @@ -6,872 +6,1122 @@ #include #include "../ncnn_ops_definer.h" -namespace mmdeploy { -using namespace ncnn; -DEFINE_LAYER_CREATOR(TopK) -DEFINE_NCNN_OPS(TopK, TopK) - -TopK::TopK() { - one_blob_only = false; - support_inplace = false; -} -int TopK::load_param(const ParamDict& pd) { - axis = pd.get(0, -1); - largest = pd.get(1, 1); - sorted = pd.get(2, 1); - keep_dims = pd.get(3, 1); - - return 0; -} -int TopK::forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const Option& opt) const { - int dims = bottom_blobs[0].dims; - int positive_axis = axis < 0 ? dims + axis : axis; - int topk; - if (bottom_blobs.size() == 2) { - const Mat& topk_blob = bottom_blobs[1]; - topk = (int)(topk_blob[0] + 0.5); - } else if (bottom_blobs.size() == 1) { - topk = 1; - } else { - fprintf(stderr, "topk input blobs should be 1 or 2, but not %ld\n", bottom_blobs.size()); - return -103; - } - - // To do: Cut the top_val_blob after unit test. And we should change them in - // param files. - // Adaptive outputs. For onnx TopK, we output 2 blobs, for ArgMax, we output - // 1 blob. 
- Mat& top_val_blob = top_blobs[0]; - Mat& top_ind_blob = top_blobs.size() == 2 ? top_blobs[1] : top_val_blob; - - if (topk > 1) { - // real topk - if (keep_dims == 0) { - fprintf(stderr, "real topk should not reduce dims!\n"); - return -102; +namespace mmdeploy +{ + using namespace ncnn; + DEFINE_LAYER_CREATOR(TopK) + DEFINE_NCNN_OPS(TopK, TopK) + + TopK::TopK() + { + one_blob_only = false; + support_inplace = false; } - if (dims == 1 && positive_axis == 0) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - const float* ptr = bottom_blobs[0]; - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(ptr[i], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(ptr[i], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - float* valptr = top_val_blob; - float* indptr = top_ind_blob; - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - valptr[i] = vec[i].first; - indptr[i] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - - // pair comparison - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0][i] > valtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } else if (bottom_blobs[0][i] == valtarget && i <= indtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0][i] < valtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } else if (bottom_blobs[0][i] == valtarget && i <= indtarget) { - valptr[cur] = bottom_blobs[0][i]; - indptr[cur] = i; - cur++; - } - } - } - } + + int TopK::load_param(const ParamDict& pd) + { + axis = pd.get(0, -1); + largest = pd.get(1, 1); + sorted = pd.get(2, 1); + keep_dims = pd.get(3, 1); + + return 0; } - if (dims == 2 && positive_axis == 0) { - if (topk > bottom_blobs[0].h) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].h); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest 
attribute should be 0 or 1, but not %d\n", largest); - return -100; + + int TopK::forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const Option& opt) const + { + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + int topk; + if (bottom_blobs.size() == 2) + { + const Mat& topk_blob = bottom_blobs[1]; + topk = (int)(topk_blob[0] + 0.5); } - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.row(i)[col] = vec[i].first; - top_ind_blob.row(i)[col] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(i)[col] > valtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(i)[col] < valtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) { - top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; - top_ind_blob.row(cur)[col] = i; - cur++; - } - } - } - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; + else if (bottom_blobs.size() == 1) + { + topk = 1; } - } - } - if (dims == 2 && positive_axis == 1) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], -i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; + else + { + fprintf(stderr, "topk input blobs should be 1 or 2, but not %ld\n", bottom_blobs.size()); + return -103; } - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.row(r)[i] = vec[i].first; - top_ind_blob.row(r)[i] = abs(vec[i].second); - } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(r)[i] > valtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - 
top_ind_blob.row(r)[cur] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].row(r)[i] < valtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; - top_ind_blob.row(r)[cur] = i; - cur++; - } - } - } + // To do: Cut the top_val_blob after unit test. And we should change them in + // param files. + // Adaptive outputs. For onnx TopK, we output 2 blobs, for ArgMax, we output + // 1 blob. + Mat& top_val_blob = top_blobs[0]; + Mat& top_ind_blob = top_blobs.size() == 2 ? top_blobs[1] : top_val_blob; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - if (dims == 3 && positive_axis == 0) { - if (topk > bottom_blobs[0].c) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].c); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], -i); + if (topk > 1) + { + // real topk + if (keep_dims == 0) + { + fprintf(stderr, "real topk should not reduce dims!\n"); + return -102; } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], i); - } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.channel(i).row(r)[col] = vec[i].first; - top_ind_blob.channel(i).row(r)[col] = abs(vec[i].second); + if (dims == 1 && positive_axis == 0) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + const float* ptr = bottom_blobs[0]; + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(ptr[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(ptr[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + float* valptr = top_val_blob; + float* indptr = top_ind_blob; + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + valptr[i] = vec[i].first; + indptr[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float 
valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + + // pair comparison + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0][i] > valtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + else if (bottom_blobs[0][i] == valtarget && i <= indtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0][i] < valtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + else if (bottom_blobs[0][i] == valtarget && i <= indtarget) + { + valptr[cur] = bottom_blobs[0][i]; + indptr[cur] = i; + cur++; + } + } + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(i).row(r)[col] > valtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].c; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(i).row(r)[col] < valtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) { - top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; - top_ind_blob.channel(cur).row(r)[col] = i; - cur++; - } - } + if (dims == 2 && positive_axis == 0) + { + if (topk > bottom_blobs[0].h) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].h); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.row(i)[col] = vec[i].first; + top_ind_blob.row(i)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(i)[col] > valtarget) + { + top_val_blob.row(cur)[col] = 
bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(i)[col] < valtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) + { + top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col]; + top_ind_blob.row(cur)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } } + if (dims == 2 && positive_axis == 1) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - } - if (dims == 3 && positive_axis == 1) { - if (topk > bottom_blobs[0].h) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - std::vector > vec; - vec.resize(bottom_blobs[0].h); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], -i); + top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.row(r)[i] = vec[i].first; + top_ind_blob.row(r)[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].row(r)[i] > valtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if 
(bottom_blobs[0].row(r)[i] < valtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i]; + top_ind_blob.row(r)[cur] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], i); + if (dims == 3 && positive_axis == 0) + { + if (topk > bottom_blobs[0].c) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].c); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(i).row(r)[col] = vec[i].first; + top_ind_blob.channel(i).row(r)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(i).row(r)[col] > valtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(i).row(r)[col] < valtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) + { + top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col]; + top_ind_blob.channel(cur).row(r)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - 
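Both the removed and the reformatted sorted == 0 branches around here implement the same order-preserving selection: partial_sort (keyed on value, with a negated index so that ties prefer the smaller original index) is only used to find the k-th value and its original index, and a second in-order pass then emits every element strictly better than that threshold, plus ties at indices up to it. A minimal standalone sketch of the idea, outside the patch and with illustrative names only:

    #include <algorithm>
    #include <functional>
    #include <utility>
    #include <vector>

    // Indices of the k largest elements of x, kept in input order with ties
    // broken by smaller index, mirroring the layer's largest == 1 / sorted == 0
    // path. Assumes 1 <= k <= x.size().
    static std::vector<int> topk_indices_unsorted(const std::vector<float>& x, int k)
    {
        std::vector<std::pair<float, int>> vec(x.size());
        for (int i = 0; i < (int)x.size(); i++)
            vec[i] = std::make_pair(x[i], -i); // -i: greater<> prefers smaller i on ties
        std::partial_sort(vec.begin(), vec.begin() + k, vec.end(), std::greater<std::pair<float, int>>());
        const float valtarget = vec[k - 1].first;   // k-th largest value
        const int   indtarget = -vec[k - 1].second; // its original index
        std::vector<int> out;
        for (int i = 0; i < (int)x.size() && (int)out.size() < k; i++)
            if (x[i] > valtarget || (x[i] == valtarget && i <= indtarget))
                out.push_back(i);
        return out;
    }

For x = {3, 1, 3, 2} and k = 2 this returns {0, 2}: both threes qualify and stay in input order, which is the behaviour the per-row and per-column loops above reproduce for each slice.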
- if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.channel(page).row(i)[col] = vec[i].first; - top_ind_blob.channel(page).row(i)[col] = abs(vec[i].second); + if (dims == 3 && positive_axis == 1) + { + if (topk > bottom_blobs[0].h) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int page = 0; page < bottom_blobs[0].c; page++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].h); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(page).row(i)[col] = vec[i].first; + top_ind_blob.channel(page).row(i)[col] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + for (int i = 0; i < bottom_blobs[0].h; i++) + { + if (cur >= topk) break; + if (largest == 1) + { + if (bottom_blobs[0].channel(page).row(i)[col] > valtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && + i <= indtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + } + else + { + if (bottom_blobs[0].channel(page).row(i)[col] < valtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && + i <= indtarget) + { + top_val_blob.channel(page).row(cur)[col] = + bottom_blobs[0].channel(page).row(i)[col]; + top_ind_blob.channel(page).row(cur)[col] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - for (int i = 0; i < bottom_blobs[0].h; i++) { - if (cur >= topk) break; - if (largest == 1) { - if (bottom_blobs[0].channel(page).row(i)[col] > valtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && - i <= indtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } - } 
else { - if (bottom_blobs[0].channel(page).row(i)[col] < valtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget && - i <= indtarget) { - top_val_blob.channel(page).row(cur)[col] = - bottom_blobs[0].channel(page).row(i)[col]; - top_ind_blob.channel(page).row(cur)[col] = i; - cur++; - } - } + if (dims == 3 && positive_axis == 2) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + + for (int page = 0; page < bottom_blobs[0].c; page++) + { + for (int r = 0; r < bottom_blobs[0].h; r++) + { + std::vector> vec; + vec.resize(bottom_blobs[0].w); + + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], -i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater>()); + } + else if (largest == 0) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], i); + } + std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::less>()); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + + if (sorted == 1) + { + for (int i = 0; i < topk; i++) + { + top_val_blob.channel(page).row(r)[i] = vec[i].first; + top_ind_blob.channel(page).row(r)[i] = abs(vec[i].second); + } + } + else if (sorted == 0) + { + int cur = 0; + float valtarget = vec[topk - 1].first; + int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); + if (largest == 1) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(page).row(r)[i] > valtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + } + } + else + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + if (cur >= topk) break; + if (bottom_blobs[0].channel(page).row(r)[i] < valtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) + { + top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; + top_ind_blob.channel(page).row(r)[cur] = i; + cur++; + } + } + } + } + else + { + fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); + return -100; + } + } + } } - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } } - } - } - if (dims == 3 && positive_axis == 2) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if 
(top_val_blob.empty()) return -100; - - top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - for (int r = 0; r < bottom_blobs[0].h; r++) { - std::vector > vec; - vec.resize(bottom_blobs[0].w); - - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], -i); + else + { + if (topk <= 0) + { + fprintf(stderr, "topk should not <= 0!\n"); + return -102; } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::greater >()); - } else if (largest == 0) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], i); + if (dims == 1 && positive_axis == 0) + { + if (topk > bottom_blobs[0].w) + { + fprintf(stderr, "topk should not greater than total items!\n"); + return -100; + } + top_val_blob.create(topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + if (top_blobs.size() == 2) + { + top_ind_blob.create(topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].w); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[0] = *index_iter; + if (top_blobs.size() == 2) + indptr[0] = std::distance(vec.begin(), index_iter); + else + valptr[0] = std::distance(vec.begin(), index_iter); // replace with index + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[0] = *index_iter; + if (top_blobs.size() == 2) + indptr[0] = std::distance(vec.begin(), index_iter); + else + valptr[0] = std::distance(vec.begin(), index_iter); // replace with index + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } } - std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), - std::less >()); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - - if (sorted == 1) { - for (int i = 0; i < topk; i++) { - top_val_blob.channel(page).row(r)[i] = vec[i].first; - top_ind_blob.channel(page).row(r)[i] = abs(vec[i].second); + if (dims == 2 && positive_axis == 0) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].h); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = ptr[i * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + 
valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } } - } else if (sorted == 0) { - int cur = 0; - float valtarget = vec[topk - 1].first; - int indtarget = (int)(abs(vec[topk - 1].second) + 0.5); - if (largest == 1) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(page).row(r)[i] > valtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } - } - } else { - for (int i = 0; i < bottom_blobs[0].w; i++) { - if (cur >= topk) break; - if (bottom_blobs[0].channel(page).row(r)[i] < valtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) { - top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i]; - top_ind_blob.channel(page).row(r)[cur] = i; - cur++; - } - } + if (dims == 2 && positive_axis == 1) + { + if (keep_dims == 1) + { + top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].w); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[r * bottom_blobs[0].w + i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } } + if (dims == 3 && positive_axis == 0) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if 
(top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } + } + const float* ptr = bottom_blobs[0]; + std::vector vec; + vec.resize(bottom_blobs[0].c); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; - } else { - fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted); - return -100; - } - } - } - } - } else { - if (topk <= 0) { - fprintf(stderr, "topk should not <= 0!\n"); - return -102; - } - if (dims == 1 && positive_axis == 0) { - if (topk > bottom_blobs[0].w) { - fprintf(stderr, "topk should not greater than total items!\n"); - return -100; - } - top_val_blob.create(topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - - if (top_blobs.size() == 2) { - top_ind_blob.create(topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].w); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[0] = *index_iter; - if (top_blobs.size() == 2) - indptr[0] = std::distance(vec.begin(), index_iter); - else - valptr[0] = std::distance(vec.begin(), index_iter); // replace with index - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[0] = *index_iter; - if (top_blobs.size() == 2) - indptr[0] = std::distance(vec.begin(), index_iter); - else - valptr[0] = std::distance(vec.begin(), index_iter); // replace with index - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - if (dims == 2 && positive_axis == 0) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].c; i++) + { + ptr = bottom_blobs[0].channel(i); + vec[i] = ptr[r * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[r * top_val_blob.w + col] = *index_iter; + if (top_blobs.size() == 2) + indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[r * top_val_blob.w + col] = *index_iter; - } else { - top_val_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } 
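+                        // With a single output blob, the index just overwrote the
+                        // value slot above (ArgMax semantics); with two blobs the
+                        // h x w output planes are addressed row-major as r * w + col.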
+ else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } + if (dims == 3 && positive_axis == 1) + { + if (keep_dims == 1) + { + top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].h); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = ptr[i * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - if (dims == 2 && positive_axis == 1) { - if (keep_dims == 1) { - top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + std::vector vec; + vec.resize(bottom_blobs[0].h); - } else { - top_val_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].w); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[r * bottom_blobs[0].w + i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - if (dims == 3 && positive_axis == 0) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, 
opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + for (int page = 0; page < bottom_blobs[0].c; page++) + { + const float* ptr = bottom_blobs[0].channel(page); + float* valptr = top_val_blob.channel(page); + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page); + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = ptr[i * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[col] = *index_iter; + if (top_blobs.size() == 2) + indptr[col] = std::distance(vec.begin(), index_iter); + else + valptr[col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } + else + { + top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - } else { - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } - } - const float* ptr = bottom_blobs[0]; - std::vector vec; - vec.resize(bottom_blobs[0].c); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].c; i++) { - ptr = bottom_blobs[0].channel(i); - vec[i] = ptr[r * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[r * top_val_blob.w + col] = *index_iter; - if (top_blobs.size() == 2) - indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[r * top_val_blob.w + col] = *index_iter; - - if (top_blobs.size() == 2) - indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - } - if (dims == 3 && positive_axis == 1) { - if (keep_dims == 1) { - top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + std::vector vec; + vec.resize(bottom_blobs[0].h); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; - std::vector vec; - vec.resize(bottom_blobs[0].h); - - for (int page = 0; page < bottom_blobs[0].c; 
page++) { - const float* ptr = bottom_blobs[0].channel(page); - float* valptr = top_val_blob.channel(page); - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page); - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = ptr[i * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[col] = *index_iter; - if (top_blobs.size() == 2) - indptr[col] = std::distance(vec.begin(), index_iter); - else - valptr[col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; + for (int page = 0; page < bottom_blobs[0].c; page++) + { + const float* ptr = bottom_blobs[0].channel(page); + for (int col = 0; col < bottom_blobs[0].w; col++) + { + for (int i = 0; i < bottom_blobs[0].h; i++) + { + vec[i] = ptr[i * bottom_blobs[0].w + col]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[page * top_val_blob.w + col] = *index_iter; + if (top_blobs.size() == 2) + indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[page * top_val_blob.w + col] = *index_iter; + if (top_blobs.size() == 2) + indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + else + valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } } - } - } - } else { - top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + if (dims == 3 && positive_axis == 2) + { + if (keep_dims == 1) + { + top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - std::vector vec; - vec.resize(bottom_blobs[0].h); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - const float* ptr = bottom_blobs[0].channel(page); - for (int col = 0; col < bottom_blobs[0].w; col++) { - for (int i = 0; i < bottom_blobs[0].h; i++) { - vec[i] = ptr[i * bottom_blobs[0].w + col]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[page * top_val_blob.w + col] = *index_iter; - if (top_blobs.size() == 2) - indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), 
vec.end()); - valptr[page * top_val_blob.w + col] = *index_iter; - if (top_blobs.size() == 2) - indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - else - valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - } - } - if (dims == 3 && positive_axis == 2) { - if (keep_dims == 1) { - top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + std::vector vec; + vec.resize(bottom_blobs[0].w); - std::vector vec; - vec.resize(bottom_blobs[0].w); - - for (int page = 0; page < bottom_blobs[0].c; page++) { - const float* ptr = bottom_blobs[0].channel(page); - float* valptr = top_val_blob.channel(page); - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page); - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[r * bottom_blobs[0].w + i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[r] = *index_iter; - if (top_blobs.size() == 2) - indptr[r] = std::distance(vec.begin(), index_iter); - else - valptr[r] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; - } - } - } - } else { - top_val_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_val_blob.empty()) return -100; - if (top_blobs.size() == 2) { - top_ind_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); - if (top_ind_blob.empty()) return -100; - } + for (int page = 0; page < bottom_blobs[0].c; page++) + { + const float* ptr = bottom_blobs[0].channel(page); + float* valptr = top_val_blob.channel(page); + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page); + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[r * bottom_blobs[0].w + i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[r] = *index_iter; + if (top_blobs.size() == 2) + indptr[r] = std::distance(vec.begin(), index_iter); + else + valptr[r] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } + else + { + top_val_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_val_blob.empty()) return -100; + if (top_blobs.size() == 2) + { + top_ind_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator); + if (top_ind_blob.empty()) return -100; + } - std::vector vec; - 
vec.resize(bottom_blobs[0].w); - float* valptr = top_val_blob; - float* indptr; - if (top_blobs.size() == 2) indptr = top_ind_blob; - - for (int page = 0; page < bottom_blobs[0].c; page++) { - const float* ptr = bottom_blobs[0].channel(page); - for (int r = 0; r < bottom_blobs[0].h; r++) { - for (int i = 0; i < bottom_blobs[0].w; i++) { - vec[i] = ptr[r * bottom_blobs[0].w + i]; - } - if (largest == 1) { - auto index_iter = std::max_element(vec.begin(), vec.end()); - valptr[page * top_val_blob.w + r] = *index_iter; - if (top_blobs.size() == 2) - indptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); - else - valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); - } else if (largest == 0) { - auto index_iter = std::min_element(vec.begin(), vec.end()); - valptr[page * top_val_blob.w + r] = *index_iter; - if (top_blobs.size() == 2) - indptr[page * top_val_blob.w + r] = std::distance(vec.begin(), index_iter); - else - valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); - } else { - fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); - return -100; + std::vector vec; + vec.resize(bottom_blobs[0].w); + float* valptr = top_val_blob; + float* indptr; + if (top_blobs.size() == 2) indptr = top_ind_blob; + + for (int page = 0; page < bottom_blobs[0].c; page++) + { + const float* ptr = bottom_blobs[0].channel(page); + for (int r = 0; r < bottom_blobs[0].h; r++) + { + for (int i = 0; i < bottom_blobs[0].w; i++) + { + vec[i] = ptr[r * bottom_blobs[0].w + i]; + } + if (largest == 1) + { + auto index_iter = std::max_element(vec.begin(), vec.end()); + valptr[page * top_val_blob.w + r] = *index_iter; + if (top_blobs.size() == 2) + indptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); + else + valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); + } + else if (largest == 0) + { + auto index_iter = std::min_element(vec.begin(), vec.end()); + valptr[page * top_val_blob.w + r] = *index_iter; + if (top_blobs.size() == 2) + indptr[page * top_val_blob.w + r] = std::distance(vec.begin(), index_iter); + else + valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter); + } + else + { + fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest); + return -100; + } + } + } + } } - } } - } + return 0; } - } - return 0; -} } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h index d390fbafcd..45e7968b79 100644 --- a/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h +++ b/csrc/mmdeploy/backend_ops/ncnn/ops/topk/topk.h @@ -4,21 +4,26 @@ #include "layer.h" -namespace mmdeploy { - -class TopK : public ncnn::Layer { - public: - TopK(); - virtual int load_param(const ncnn::ParamDict& pd); - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, - const ncnn::Option& opt) const; - - public: - int axis; - int largest; - int sorted; - int keep_dims; -}; +namespace mmdeploy +{ + + class TopK : public ncnn::Layer + { + public: + TopK(); + + virtual int load_param(const ncnn::ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, + std::vector& top_blobs, + const ncnn::Option& opt) const; + + public: + int axis; + int largest; + int sorted; + int keep_dims; + }; } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/CMakeLists.txt b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/CMakeLists.txt index 652f841f7a..1d2a381837 100755 
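The TopK class declared above plugs into ncnn through the standard custom-layer hook that the pyncnn_ext module below ultimately invokes. A minimal standalone sketch of that wiring, not part of the patch (the creator follows ncnn's DEFINE_LAYER_CREATOR convention, and the "TopK" type string is an assumption here):

    #include "net.h"

    namespace mmdeploy
    {
        // DEFINE_LAYER_CREATOR(TopK) expands to a creator of this shape,
        // defined alongside the layer implementation.
        ncnn::Layer* TopK_layer_creator(void* userdata);
    } // namespace mmdeploy

    // Bind the type string used in .param files to the creator so that
    // Net::load_param() can instantiate the layer; register_mmdeploy_custom_layers()
    // does the same for every op in the registry.
    static int register_topk(ncnn::Net& net)
    {
        return net.register_custom_layer("TopK", mmdeploy::TopK_layer_creator);
    }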
--- a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/CMakeLists.txt @@ -3,15 +3,16 @@ project(ncnn_ext) # pybind11 -if (NOT TARGET pybind11) - add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) -endif () +if(NOT TARGET pybind11) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) +endif() pybind11_add_module(ncnn_ext ncnn_ext.cpp) target_link_libraries(ncnn_ext PUBLIC mmdeploy_ncnn_ops ncnn) set(_NCNN_EXT_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/backend/ncnn) -set_target_properties(ncnn_ext PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${_NCNN_EXT_DIR} - LIBRARY_OUTPUT_DIRECTORY_DEBUG ${_NCNN_EXT_DIR} - LIBRARY_OUTPUT_DIRECTORY_RELEASE ${_NCNN_EXT_DIR}) +set_target_properties( + ncnn_ext + PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${_NCNN_EXT_DIR} + LIBRARY_OUTPUT_DIRECTORY_DEBUG ${_NCNN_EXT_DIR} + LIBRARY_OUTPUT_DIRECTORY_RELEASE ${_NCNN_EXT_DIR}) diff --git a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp old mode 100755 new mode 100644 index ac158b9edb..1c8ad70cc7 --- a/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp +++ b/csrc/mmdeploy/backend_ops/ncnn/pyncnn_ext/ncnn_ext.cpp @@ -4,9 +4,11 @@ #include "ncnn_ops_register.h" #include "net.h" -PYBIND11_MODULE(ncnn_ext, m) { - m.def( - "register_mmdeploy_custom_layers", - [](ncnn::Net &net) { return register_mmdeploy_custom_layers(net); }, - "register mmdeploy custom ncnn layers."); +PYBIND11_MODULE(ncnn_ext, m) +{ + m.def( + "register_mmdeploy_custom_layers", + [](ncnn::Net& net) + { return register_mmdeploy_custom_layers(net); }, + "register mmdeploy custom ncnn layers."); } diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/CMakeLists.txt b/csrc/mmdeploy/backend_ops/onnxruntime/CMakeLists.txt index 9548110be6..f8f7e35f77 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/onnxruntime/CMakeLists.txt @@ -9,16 +9,18 @@ include(${CMAKE_SOURCE_DIR}/cmake/MMDeploy.cmake) file(GLOB_RECURSE ORT_OPS_SRCS *.cpp) add_library(${PROJECT_NAME}_obj OBJECT "${ORT_OPS_SRCS}") target_compile_definitions(${PROJECT_NAME}_obj PRIVATE -DMMDEPLOY_API_EXPORTS=1) -target_compile_options(${PROJECT_NAME}_obj PRIVATE - $<$:-fvisibility=hidden>) -set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1) +target_compile_options(${PROJECT_NAME}_obj + PRIVATE $<$:-fvisibility=hidden>) +set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE + 1) mmdeploy_export(${PROJECT_NAME}_obj) -target_include_directories(${PROJECT_NAME}_obj PUBLIC - $ - $ - $ - $) +target_include_directories( + ${PROJECT_NAME}_obj + PUBLIC $ + $ + $ + $) target_link_libraries(${PROJECT_NAME}_obj PUBLIC onnxruntime) mmdeploy_add_library(${PROJECT_NAME} SHARED EXCLUDE "") diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h b/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h index 28d2a2b782..1095c28bae 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/onnxruntime_register.h @@ -6,11 +6,12 @@ #include "mmdeploy/core/macro.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -MMDEPLOY_API OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, - const OrtApiBase *api); + MMDEPLOY_API OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, + const OrtApiBase* api); #ifdef __cplusplus } diff --git 
a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp index c604e4b650..da959ec37e 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.cpp @@ -1,10 +1,12 @@ // Copyright (c) OpenMMLab. All rights reserved. #include "ort_utils.h" -namespace mmdeploy { +namespace mmdeploy +{ -CustomOpsTable& get_mmdeploy_custom_ops() { - static CustomOpsTable _custom_ops; - return _custom_ops; -} + CustomOpsTable& get_mmdeploy_custom_ops() + { + static CustomOpsTable _custom_ops; + return _custom_ops; + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h index e19c984f86..14d2da3457 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/common/ort_utils.h @@ -6,32 +6,39 @@ #include #include -namespace mmdeploy { - -typedef std::unordered_map> CustomOpsTable; - -struct OrtTensorDimensions : std::vector { - OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) { - OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value); - std::vector::operator=(ort.GetTensorShape(info)); - ort.ReleaseTensorTypeAndShapeInfo(info); - } -}; - -CustomOpsTable& get_mmdeploy_custom_ops(); - -template -class OrtOpsRegistry { - public: - OrtOpsRegistry() { get_mmdeploy_custom_ops()[domain].push_back(&instance); } - - private: - T instance{}; -}; - -#define REGISTER_ONNXRUNTIME_OPS(domain, name) \ - static char __domain_##domain##name[] = #domain; \ - static OrtOpsRegistry<__domain_##domain##name, name> ort_ops_registry_##domain##name {} +namespace mmdeploy +{ + + typedef std::unordered_map> CustomOpsTable; + + struct OrtTensorDimensions : std::vector + { + OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) + { + OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value); + std::vector::operator=(ort.GetTensorShape(info)); + ort.ReleaseTensorTypeAndShapeInfo(info); + } + }; + + CustomOpsTable& get_mmdeploy_custom_ops(); + + template + class OrtOpsRegistry + { + public: + OrtOpsRegistry() + { + get_mmdeploy_custom_ops()[domain].push_back(&instance); + } + + private: + T instance{}; + }; + +#define REGISTER_ONNXRUNTIME_OPS(domain, name) \ + static char __domain_##domain##name[] = #domain; \ + static OrtOpsRegistry<__domain_##domain##name, name> ort_ops_registry_##domain##name {} } // namespace mmdeploy #endif // ORT_MMCV_UTILS_H diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp index c7fed37d23..27eb677394 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.cpp @@ -8,287 +8,335 @@ #include "ort_utils.h" -namespace mmdeploy { +namespace mmdeploy +{ #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define MAX(a, b) (((a) < (b)) ? 
(b) : (a)) #define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit - 1), MAX(in, 0)) -GridSampleKernel::GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info) - : ort_(api), info_(info) { - align_corners_ = ort_.KernelInfoGetAttribute(info, "align_corners"); - interpolation_mode_ = ort_.KernelInfoGetAttribute(info, "interpolation_mode"); - padding_mode_ = ort_.KernelInfoGetAttribute(info, "padding_mode"); - - allocator_ = Ort::AllocatorWithDefaultOptions(); -} - -enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 }; -enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 }; - -template -static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) { - if (align_corners) { - return ((coord + 1) / 2) * (size - 1); - } else { - return ((coord + 1) * size - 1) / 2; - } -} - -// Clips coordinates to between 0 and clip_limit - 1 -template -static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { - return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); -} - -// Reflects coordinates until they fall between low and high (inclusive). -// The bounds are passed as twice their value so that half-integer values -// can be represented as ints. -template -static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - scalar_t min = static_cast(twice_low) / 2; - scalar_t span = static_cast(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - scalar_t extra = std::fmod(in, span); - int flips = static_cast(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, int64_t padding_mode, - bool align_corners) { - if (padding_mode == GridSamplerPadding::Border) { - coord = clip_coordinates(coord, size); - } else if (padding_mode == GridSamplerPadding::Reflection) { - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_coordinates(coord, -1, 2 * size - 1); + GridSampleKernel::GridSampleKernel(const OrtApi& api, const OrtKernelInfo* info) + : ort_(api) + , info_(info) + { + align_corners_ = ort_.KernelInfoGetAttribute(info, "align_corners"); + interpolation_mode_ = ort_.KernelInfoGetAttribute(info, "interpolation_mode"); + padding_mode_ = ort_.KernelInfoGetAttribute(info, "padding_mode"); + + allocator_ = Ort::AllocatorWithDefaultOptions(); } - coord = clip_coordinates(coord, size); - } - return coord; -} - -// Computes the pixel source index value for a grid coordinate -template -static inline scalar_t grid_sampler_compute_source_index(scalar_t coord, int64_t size, - int64_t padding_mode, bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - coord = compute_coordinates(coord, size, padding_mode, align_corners); - return coord; -} - -static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x, scalar_t y, int64_t W, - int64_t H, int64_t sW, int64_t sH, int64_t padding_mode, - bool align_corners) { - x = compute_coordinates(x, W, padding_mode, align_corners); - y = compute_coordinates(y, H, padding_mode, align_corners); - - 
int64_t ix = static_cast(x); - int64_t iy = static_cast(y); - - if (within_bounds_2d(iy, ix, H, W)) { - return data[iy * sH + ix * sW]; - } - return static_cast(0); -} - -template -static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; -} - -template -static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; -} - -template -static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4], scalar_t t) { - scalar_t A = -0.75; - - scalar_t x1 = t; - coeffs[0] = cubic_convolution2(x1 + 1.0, A); - coeffs[1] = cubic_convolution1(x1, A); - - // opposite coefficients - scalar_t x2 = 1.0 - t; - coeffs[2] = cubic_convolution1(x2, A); - coeffs[3] = cubic_convolution2(x2 + 1.0, A); -} - -template -static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3, - scalar_t t) { - scalar_t coeffs[4]; - get_cubic_upsample_coefficients(coeffs, t); - - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; -} - -void GridSampleKernel::Compute(OrtKernelContext *context) { - const bool align_corners = align_corners_; - const int64_t padding_mode = padding_mode_; - const int64_t interpolation_mode = interpolation_mode_; - - const OrtValue *input = ort_.KernelContext_GetInput(context, 0); - const float *input_data = reinterpret_cast(ort_.GetTensorData(input)); - - const OrtValue *grid = ort_.KernelContext_GetInput(context, 1); - const float *grid_data = reinterpret_cast(ort_.GetTensorData(grid)); - - OrtTensorDimensions input_dims(ort_, input); - OrtTensorDimensions grid_dims(ort_, grid); - int64_t N = input_dims[0]; - int64_t C = input_dims[1]; - int64_t inp_H = input_dims[2]; - int64_t inp_W = input_dims[3]; - int64_t out_H = grid_dims[1]; - int64_t out_W = grid_dims[2]; - - std::vector output_dims = {N, C, out_H, out_W}; - OrtValue *output = - ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size()); - float *out_ptr = ort_.GetTensorMutableData(output); - - int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3]; - int64_t inp_sC = input_dims[2] * input_dims[3]; - int64_t inp_sH = input_dims[3]; - int64_t inp_sW = 1; - int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3]; - int64_t grid_sH = grid_dims[2] * grid_dims[3]; - int64_t grid_sW = grid_dims[3]; - int64_t grid_sCoor = 1; - int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3]; - int64_t out_sC = output_dims[2] * output_dims[3]; - int64_t out_sH = output_dims[3]; - int64_t out_sW = 1; - - // loop over each output pixel - for (int64_t n = 0; n < N; ++n) { - const float *grid_ptr_N = grid_data + n * grid_sN; - const float *inp_ptr_N = input_data + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; - float x = *grid_ptr_NHW; - float y = grid_ptr_NHW[grid_sCoor]; - - float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners); - float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y) - // for 4d, we use north-east-south-west - int64_t ix_nw = static_cast(std::floor(ix)); - int64_t iy_nw = static_cast(std::floor(iy)); - - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - - int64_t ix_se = ix_nw + 1; - int64_t 
iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - float nw = (ix_se - ix) * (iy_se - iy); - float ne = (ix - ix_sw) * (iy_sw - iy); - float sw = (ix_ne - ix) * (iy - iy_ne); - float se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - const float *inp_ptr_NC = inp_ptr_N; - float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - auto res = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - *out_ptr_NCHW = res; - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast(std::nearbyint(ix)); - int64_t iy_nearest = static_cast(std::nearbyint(iy)); - - // assign nearest neighbor pixel value to output pixel - float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - const float *inp_ptr_NC = inp_ptr_N; - for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); + + enum GridSamplerInterpolation + { + Bilinear = 0, + Nearest = 1, + Bicubic = 2 + }; + enum GridSamplerPadding + { + Zeros = 0, + Border = 1, + Reflection = 2 + }; + + template + static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) + { + if (align_corners) + { + return ((coord + 1) / 2) * (size - 1); + } + else + { + return ((coord + 1) * size - 1) / 2; + } + } + + // Clips coordinates to between 0 and clip_limit - 1 + template + static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) + { + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); + } + + // Reflects coordinates until they fall between low and high (inclusive). + // The bounds are passed as twice their value so that half-integer values + // can be represented as ints. + template + static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high) + { + if (twice_low == twice_high) + { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
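A detail of the Nearest branch: std::nearbyint rounds under the current floating-point rounding mode, which defaults to round-to-nearest-even, so exact .5 ties go to the even neighbor rather than away from zero as std::round would. A quick standalone check:

#include <cmath>
#include <cstdio>

int main()
{
    // Default FE_TONEAREST mode: ties round to the even integer.
    std::printf("%.0f %.0f %.0f\n", std::nearbyint(0.5), std::nearbyint(1.5), std::nearbyint(2.5));  // 0 2 2
    // std::round breaks ties away from zero instead.
    std::printf("%.0f %.0f %.0f\n", std::round(0.5), std::round(1.5), std::round(2.5));  // 1 2 3
}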
+ scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) + { + return extra + min; + } + else + { + return span - extra + min; + } + } + + template + static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, int64_t padding_mode, bool align_corners) + { + if (padding_mode == GridSamplerPadding::Border) + { + coord = clip_coordinates(coord, size); + } + else if (padding_mode == GridSamplerPadding::Reflection) + { + if (align_corners) + { + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); } - } - } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) { - // grid_sampler_compute_source_index will "clip the value" of idx - // depends on the padding, - // which would cause calculation to be wrong, - // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix - // = floor(x) = -1 - // There would be more problem in reflection padding, since the -1 and - // +1 direction is not fixed in boundary condition - ix = grid_sampler_unnormalize(x, inp_W, align_corners); - iy = grid_sampler_unnormalize(y, inp_H, align_corners); - - float ix_nw = std::floor(ix); - float iy_nw = std::floor(iy); - - const float tx = ix - ix_nw; - const float ty = iy - iy_nw; - - const float *inp_ptr_NC = inp_ptr_N; - float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - float coefficients[4]; - - // Interpolate 4 values in the x direction - for (int64_t i = 0; i < 4; ++i) { - coefficients[i] = cubic_interp1d( - get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, - inp_sW, inp_sH, padding_mode, align_corners), - get_value_bounded(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, - inp_sW, inp_sH, padding_mode, align_corners), - get_value_bounded(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, - inp_sW, inp_sH, padding_mode, align_corners), - get_value_bounded(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, - inp_sW, inp_sH, padding_mode, align_corners), - tx); + else + { + coord = reflect_coordinates(coord, -1, 2 * size - 1); } + coord = clip_coordinates(coord, size); + } + return coord; + } - // Interpolate in the y direction - *out_ptr_NCHW = cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], - coefficients[3], ty); - } + // Computes the pixel source index value for a grid coordinate + template + static inline scalar_t grid_sampler_compute_source_index(scalar_t coord, int64_t size, int64_t padding_mode, bool align_corners) + { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; + } + + static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) + { + return h >= 0 && h < H && w >= 0 && w < W; + } + + template + static inline scalar_t get_value_bounded(const scalar_t* data, scalar_t x, scalar_t y, int64_t W, int64_t H, int64_t sW, int64_t sH, int64_t padding_mode, bool align_corners) + { + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int64_t ix = static_cast(x); + int64_t iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) + { + return data[iy * sH + ix * sW]; + } + return static_cast(0); + } + + template + static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) + { + return ((A + 2) * x - (A + 3)) * x * x + 1; + } + + template + static inline scalar_t 
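To see how reflect_coordinates behaves: the bounds arrive doubled so that the half-integer limits of the align_corners=false case stay integral, and the value is folded back and forth across the span until it lands inside. Worked example for size = 5 with align_corners=true (reflect over [0, 4], passed as 0 and 8): 5.5 bounces off 4 to 2.5, and -1 bounces off 0 to 1. A standalone sketch (not part of the patch):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirrors reflect_coordinates, specialized to float.
static float reflect(float in, int64_t twice_low, int64_t twice_high)
{
    if (twice_low == twice_high) return 0.f;
    float lo = twice_low / 2.f;
    float span = (twice_high - twice_low) / 2.f;
    in = std::fabs(in - lo);
    float extra = std::fmod(in, span);
    int flips = static_cast<int>(std::floor(in / span));
    return (flips % 2 == 0) ? extra + lo : span - extra + lo;
}

int main()
{
    std::printf("%.1f\n", reflect(5.5f, 0, 8));   // 2.5
    std::printf("%.1f\n", reflect(-1.0f, 0, 8));  // 1.0
}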
cubic_convolution2(scalar_t x, scalar_t A) + { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; + } + + template <typename scalar_t> + static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4], scalar_t t) + { + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); + } + + template <typename scalar_t> + static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3, scalar_t t) + { + scalar_t coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; + } + + void GridSampleKernel::Compute(OrtKernelContext* context) + { + const bool align_corners = align_corners_; + const int64_t padding_mode = padding_mode_; + const int64_t interpolation_mode = interpolation_mode_; + + const OrtValue* input = ort_.KernelContext_GetInput(context, 0); + const float* input_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(input)); + + const OrtValue* grid = ort_.KernelContext_GetInput(context, 1); + const float* grid_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(grid)); + + OrtTensorDimensions input_dims(ort_, input); + OrtTensorDimensions grid_dims(ort_, grid); + int64_t N = input_dims[0]; + int64_t C = input_dims[1]; + int64_t inp_H = input_dims[2]; + int64_t inp_W = input_dims[3]; + int64_t out_H = grid_dims[1]; + int64_t out_W = grid_dims[2]; + + std::vector<int64_t> output_dims = {N, C, out_H, out_W}; + OrtValue* output = + ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size()); + float* out_ptr = ort_.GetTensorMutableData<float>(output); + + int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3]; + int64_t inp_sC = input_dims[2] * input_dims[3]; + int64_t inp_sH = input_dims[3]; + int64_t inp_sW = 1; + int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3]; + int64_t grid_sH = grid_dims[2] * grid_dims[3]; + int64_t grid_sW = grid_dims[3]; + int64_t grid_sCoor = 1; + int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3]; + int64_t out_sC = output_dims[2] * output_dims[3]; + int64_t out_sH = output_dims[3]; + int64_t out_sW = 1; + + // loop over each output pixel + for (int64_t n = 0; n < N; ++n) + { + const float* grid_ptr_N = grid_data + n * grid_sN; + const float* inp_ptr_N = input_data + n * inp_sN; + for (int64_t h = 0; h < out_H; ++h) + { + for (int64_t w = 0; w < out_W; ++w) + { + const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; + float x = *grid_ptr_NHW; + float y = grid_ptr_NHW[grid_sCoor]; + + float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners); + float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners); + + if (interpolation_mode == GridSamplerInterpolation::Bilinear) + { + // get corner pixel values from (x, y) + // for 4d, we use north-east-south-west + int64_t ix_nw = static_cast<int64_t>(std::floor(ix)); + int64_t iy_nw = static_cast<int64_t>(std::floor(iy)); + + int64_t ix_ne = ix_nw + 1; + int64_t iy_ne = iy_nw; + + int64_t ix_sw = ix_nw; + int64_t iy_sw = iy_nw + 1; + + int64_t ix_se = ix_nw + 1; + int64_t iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + float nw = (ix_se - ix) * (iy_se - iy); + float ne = (ix - ix_sw) * (iy_sw - iy); + float sw = (ix_ne - ix) * (iy - iy_ne); + float se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + const float*
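The bicubic weights use the Keys convolution kernel with A = -0.75, matching get_cubic_upsample_coefficients. Two properties worth checking: at t = 0 the weights collapse to {0, 1, 0, 0} (cubic_interp1d returns x1 exactly, so on-grid points are preserved), and for any t the four weights sum to 1. A standalone check (illustrative reimplementation):

#include <cstdio>

static float conv1(float x, float A) { return ((A + 2) * x - (A + 3)) * x * x + 1; }
static float conv2(float x, float A) { return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; }

int main()
{
    const float A = -0.75f;
    const float ts[3] = {0.0f, 0.25f, 0.5f};
    for (float t : ts)
    {
        // Same layout as get_cubic_upsample_coefficients: taps at x-1, x, x+1, x+2.
        float c[4] = {conv2(t + 1, A), conv1(t, A), conv1(1 - t, A), conv2(2 - t, A)};
        std::printf("t=%.2f w={%.4f %.4f %.4f %.4f} sum=%.4f\n",
                    t, c[0], c[1], c[2], c[3], c[0] + c[1] + c[2] + c[3]);
    }
}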
inp_ptr_NC = inp_ptr_N; + float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; + for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) + { + auto res = static_cast(0); + if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) + { + res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) + { + res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) + { + res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) + { + res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + *out_ptr_NCHW = res; + } + } + else if (interpolation_mode == GridSamplerInterpolation::Nearest) + { + int64_t ix_nearest = static_cast(std::nearbyint(ix)); + int64_t iy_nearest = static_cast(std::nearbyint(iy)); + + // assign nearest neighbor pixel value to output pixel + float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; + const float* inp_ptr_NC = inp_ptr_N; + for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) + { + if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) + { + *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; + } + else + { + *out_ptr_NCHW = static_cast(0); + } + } + } + else if (interpolation_mode == GridSamplerInterpolation::Bicubic) + { + // grid_sampler_compute_source_index will "clip the value" of idx + // depends on the padding, + // which would cause calculation to be wrong, + // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix + // = floor(x) = -1 + // There would be more problem in reflection padding, since the -1 and + // +1 direction is not fixed in boundary condition + ix = grid_sampler_unnormalize(x, inp_W, align_corners); + iy = grid_sampler_unnormalize(y, inp_H, align_corners); + + float ix_nw = std::floor(ix); + float iy_nw = std::floor(iy); + + const float tx = ix - ix_nw; + const float ty = iy - iy_nw; + + const float* inp_ptr_NC = inp_ptr_N; + float* out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; + for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) + { + float coefficients[4]; + + // Interpolate 4 values in the x direction + for (int64_t i = 0; i < 4; ++i) + { + coefficients[i] = cubic_interp1d( + get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), + get_value_bounded(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), + get_value_bounded(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), + get_value_bounded(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), + tx); + } + + // Interpolate in the y direction + *out_ptr_NCHW = cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty); + } + } + } + } } - } } - } -} -REGISTER_ONNXRUNTIME_OPS(mmdeploy, GridSampleOp); + REGISTER_ONNXRUNTIME_OPS(mmdeploy, GridSampleOp); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h index 2581b7833e..e6c9fa280f 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/grid_sample/grid_sample.h @@ -4,41 +4,59 @@ #include -namespace mmdeploy { - -struct 
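In the Bilinear branch, each corner weight is the area of the cell rectangle opposite that corner, so the four weights always sum to 1, and corners that fall outside the input simply contribute nothing (zero padding). Worked example at (ix, iy) = (2.3, 4.6): nw = 0.7 * 0.4 = 0.28, ne = 0.3 * 0.4 = 0.12, sw = 0.7 * 0.6 = 0.42, se = 0.3 * 0.6 = 0.18. A standalone check:

#include <cmath>
#include <cstdio>

int main()
{
    float ix = 2.3f, iy = 4.6f;
    float x0 = std::floor(ix), y0 = std::floor(iy);  // north-west corner
    float nw = (x0 + 1 - ix) * (y0 + 1 - iy);
    float ne = (ix - x0) * (y0 + 1 - iy);
    float sw = (x0 + 1 - ix) * (iy - y0);
    float se = (ix - x0) * (iy - y0);
    std::printf("%.2f %.2f %.2f %.2f sum=%.2f\n", nw, ne, sw, se, nw + ne + sw + se);  // sum = 1.00
}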
GridSampleKernel { - GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info); - - void Compute(OrtKernelContext *context); - - protected: - Ort::CustomOpApi ort_; - const OrtKernelInfo *info_; - Ort::AllocatorWithDefaultOptions allocator_; - - int64_t align_corners_; - int64_t interpolation_mode_; - int64_t padding_mode_; -}; - -struct GridSampleOp : Ort::CustomOpBase { - void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const { - return new GridSampleKernel(api, info); - }; - - const char *GetName() const { return "grid_sampler"; }; - - size_t GetInputTypeCount() const { return 2; }; - ONNXTensorElementDataType GetInputType(size_t /*index*/) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - }; - - size_t GetOutputTypeCount() const { return 1; }; - ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - }; - - const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; }; -}; +namespace mmdeploy +{ + + struct GridSampleKernel + { + GridSampleKernel(const OrtApi& api, const OrtKernelInfo* info); + + void Compute(OrtKernelContext* context); + + protected: + Ort::CustomOpApi ort_; + const OrtKernelInfo* info_; + Ort::AllocatorWithDefaultOptions allocator_; + + int64_t align_corners_; + int64_t interpolation_mode_; + int64_t padding_mode_; + }; + + struct GridSampleOp : Ort::CustomOpBase + { + void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const + { + return new GridSampleKernel(api, info); + }; + + const char* GetName() const + { + return "grid_sampler"; + }; + + size_t GetInputTypeCount() const + { + return 2; + }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; + + size_t GetOutputTypeCount() const + { + return 1; + }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; + + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + }; + }; } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp index 075c3277bc..320fa8dd45 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.cpp @@ -8,191 +8,218 @@ #include "modulated_deform_conv/modulated_deform_conv_cpu.h" #include "ort_utils.h" -namespace mmdeploy { - -void parallel_unroll_gemm(const float *A, const float *B, const float *V, const float *H, - const int32_t M, const int32_t N, const int32_t K, const float alpha, - const float beta, float *Y, const int32_t start_row, - const int32_t end_row) { - std::vector tmp(N); - for (int32_t m = start_row; m < end_row; ++m) { - for (int32_t n = 0; n < N; n++) { - tmp[n] = 0; - } +namespace mmdeploy +{ + + void parallel_unroll_gemm(const float* A, const float* B, const float* V, const float* H, const int32_t M, const int32_t N, const int32_t K, const float alpha, const float beta, float* Y, const int32_t start_row, const int32_t end_row) { - int32_t remainder = K % 8; // unroll - for (int32_t k = 0; k < K; k += 8) { - for (int32_t n = 0; n < N; n++) { - tmp[n] += A[m * K + k] * B[k * N + n]; - tmp[n] += A[m * K + k + 1] * B[k * N + N + n]; - tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n]; - tmp[n] += A[m * K + k + 3] * B[k * N 
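The contract of parallel_unroll_gemm: compute rows [start_row, end_row) of Y = alpha * A * B, then add beta * V (a row vector broadcast down the rows, for bias) and/or beta * H (a full matrix). In deformable_conv2d_ref_fp32 below it is invoked with V = nullptr and H = Y = dst_ptr, so the product accumulates onto the bias-initialized output; the per-row tmp buffer makes that aliasing safe. A compact reference implementation of the same contract (sketch only, without the unrolling, the threading, or the unused M parameter):

#include <cstdint>

// Y[m][n] = alpha * sum_k A[m][k] * B[k][n] (+ beta * V[n]) (+ beta * H[m][n])
void gemm_ref(const float* A, const float* B, const float* V, const float* H,
              int32_t N, int32_t K, float alpha, float beta, float* Y,
              int32_t start_row, int32_t end_row)
{
    for (int32_t m = start_row; m < end_row; ++m)
    {
        for (int32_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for (int32_t k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
            acc *= alpha;
            if (V) acc += beta * V[n];          // broadcast bias row
            if (H) acc += beta * H[m * N + n];  // elementwise accumulate
            Y[m * N + n] = acc;
        }
    }
}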
+ 3 * N + n]; - tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n]; - tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n]; - tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n]; - tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n]; + std::vector tmp(N); + for (int32_t m = start_row; m < end_row; ++m) + { + for (int32_t n = 0; n < N; n++) + { + tmp[n] = 0; + } + { + int32_t remainder = K % 8; // unroll + for (int32_t k = 0; k < K; k += 8) + { + for (int32_t n = 0; n < N; n++) + { + tmp[n] += A[m * K + k] * B[k * N + n]; + tmp[n] += A[m * K + k + 1] * B[k * N + N + n]; + tmp[n] += A[m * K + k + 2] * B[k * N + 2 * N + n]; + tmp[n] += A[m * K + k + 3] * B[k * N + 3 * N + n]; + tmp[n] += A[m * K + k + 4] * B[k * N + 4 * N + n]; + tmp[n] += A[m * K + k + 5] * B[k * N + 5 * N + n]; + tmp[n] += A[m * K + k + 6] * B[k * N + 6 * N + n]; + tmp[n] += A[m * K + k + 7] * B[k * N + 7 * N + n]; + } + } + for (int32_t k = K - remainder; k < K; k++) + { + for (int32_t n = 0; n < N; n++) + { + tmp[n] += A[m * K + k] * B[k * N + n]; + } + } + } + for (int32_t n = 0; n < N; n++) + { + tmp[n] *= alpha; + if (V) tmp[n] += beta * V[n]; + if (H) tmp[n] += beta * H[m * N + n]; + Y[m * N + n] = tmp[n]; + } } - } - for (int32_t k = K - remainder; k < K; k++) { - for (int32_t n = 0; n < N; n++) { - tmp[n] += A[m * K + k] * B[k * N + n]; + } + + void deformable_conv2d_ref_fp32(const float* src, const float* offset, const float* mask, const float* filter, const float* bias, const int64_t batch, const int64_t src_c, const int64_t src_h, const int64_t src_w, const int64_t dst_c, const int64_t dst_h, const int64_t dst_w, const int64_t group, const int64_t offset_group, const int64_t channels, const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w, const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w, float* columns, float* dst) + { + const int64_t ic_per_gp = channels / group; + const int64_t oc_per_gp = num_output / group; + // Set up for launching threads + std::size_t num_threads = std::thread::hardware_concurrency(); + std::vector threads; + threads.reserve(num_threads); + + for (int64_t b = 0; b < batch; ++b) + { + for (int64_t g = 0; g < group; ++g) + { + deformable_im2col_2d( + src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w, + offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w, + mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, + src_h, + src_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + ic_per_gp, + offset_group, + dst_h, + dst_w, + mask != nullptr, + columns); + float* dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w; + if (bias != nullptr) + { + const float* bias_ptr = bias + g * oc_per_gp; + for (int64_t oc = 0; oc < oc_per_gp; ++oc) + { + for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) + { + dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc]; + } + } + } + else + { + memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w); + } + if (num_threads > 1) + { + // Calculate values to pass to threads + int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads; + int32_t end_row = 0; + for (int32_t i = 0; i < num_threads; i++) + { + auto start_row = i * n_rows; + end_row = start_row + n_rows; + if (end_row > oc_per_gp) end_row = oc_per_gp; + std::thread t(parallel_unroll_gemm, + filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, + columns, + nullptr, + dst_ptr, + oc_per_gp, + dst_h 
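One thing to watch in the unrolled inner loop as transcribed here: the condition `k < K` lets the eight-wide block run past the end when K is not a multiple of 8, and the scalar tail loop then adds the last K % 8 products a second time; the conventional shape bounds the unrolled loop at K - remainder. A minimal sketch of that structure on a plain dot product (illustrative, assuming that was the intent):

#include <cstdint>
#include <cstdio>

static float dot_unrolled(const float* a, const float* b, int32_t K)
{
    int32_t tail = K % 8;
    float acc = 0.f;
    for (int32_t k = 0; k + 8 <= K; k += 8)  // stops at K - tail
        for (int32_t u = 0; u < 8; ++u) acc += a[k + u] * b[k + u];
    for (int32_t k = K - tail; k < K; ++k)   // scalar remainder
        acc += a[k] * b[k];
    return acc;
}

int main()
{
    float a[11], b[11];
    for (int i = 0; i < 11; ++i) { a[i] = 1.f; b[i] = 2.f; }
    std::printf("%.1f\n", dot_unrolled(a, b, 11));  // 22.0; K = 11 exercises the tail
}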
* dst_w, + ic_per_gp * kernel_h * kernel_w, + 1.0f, + 1.0f, + dst_ptr, + start_row, + end_row); + threads.emplace_back(std::move(t)); + } + // Wait for all threads to complete + for (auto& t : threads) t.join(); + threads.clear(); + } + else + { // parallel gemm degrade to serial gemm with start_row=0 and end_row= oc_per_gp + parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr, dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp); + } + } } - } } - for (int32_t n = 0; n < N; n++) { - tmp[n] *= alpha; - if (V) tmp[n] += beta * V[n]; - if (H) tmp[n] += beta * H[m * N + n]; - Y[m * N + n] = tmp[n]; + + MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi& api, + const OrtKernelInfo* info) + : ort_(api) + , info_(info) + { + std::vector stride = ort_.KernelInfoGetAttribute>(info, "stride"); + stride_height_ = stride[0]; + stride_width_ = stride[1]; + std::vector padding = ort_.KernelInfoGetAttribute>(info, "padding"); + padding_height_ = padding[0]; + padding_width_ = padding[1]; + std::vector dilation = + ort_.KernelInfoGetAttribute>(info, "dilation"); + dilation_height_ = dilation[0]; + dilation_width_ = dilation[1]; + deformable_group_ = ort_.KernelInfoGetAttribute(info, "deform_groups"); + group_ = ort_.KernelInfoGetAttribute(info, "groups"); + + // create allocator + allocator_ = Ort::AllocatorWithDefaultOptions(); } - } -} - -void deformable_conv2d_ref_fp32(const float *src, const float *offset, const float *mask, - const float *filter, const float *bias, const int64_t batch, - const int64_t src_c, const int64_t src_h, const int64_t src_w, - const int64_t dst_c, const int64_t dst_h, const int64_t dst_w, - const int64_t group, const int64_t offset_group, - const int64_t channels, const int64_t num_output, - const int64_t kernel_h, const int64_t kernel_w, - const int64_t stride_h, const int64_t stride_w, const int64_t pad_h, - const int64_t pad_w, const int64_t dilation_h, - const int64_t dilation_w, float *columns, float *dst) { - const int64_t ic_per_gp = channels / group; - const int64_t oc_per_gp = num_output / group; - // Set up for launching threads - std::size_t num_threads = std::thread::hardware_concurrency(); - std::vector threads; - threads.reserve(num_threads); - - for (int64_t b = 0; b < batch; ++b) { - for (int64_t g = 0; g < group; ++g) { - deformable_im2col_2d( - src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w, - offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w, - mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h, src_w, kernel_h, - kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, ic_per_gp, - offset_group, dst_h, dst_w, mask != nullptr, columns); - float *dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w; - if (bias != nullptr) { - const float *bias_ptr = bias + g * oc_per_gp; - for (int64_t oc = 0; oc < oc_per_gp; ++oc) { - for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) { - dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc]; - } - } - } else { - memset(dst_ptr, 0.0f, sizeof(float) * oc_per_gp * dst_h * dst_w); - } - if (num_threads > 1) { - // Calculate values to pass to threads - int32_t n_rows = (oc_per_gp + num_threads - 1) / num_threads; - int32_t end_row = 0; - for (int32_t i = 0; i < num_threads; i++) { - auto start_row = i * n_rows; - end_row = start_row + n_rows; - if (end_row > oc_per_gp) end_row = oc_per_gp; - std::thread t(parallel_unroll_gemm, - filter + 
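The thread partition above uses a ceiling division, n_rows = (oc_per_gp + num_threads - 1) / num_threads, with the last chunk clamped to oc_per_gp. For example, 10 output channels over 4 threads gives chunks [0,3), [3,6), [6,9), [9,10). A standalone sketch of the same split:

#include <cstdint>
#include <cstdio>

int main()
{
    int32_t rows = 10, num_threads = 4;
    int32_t n_rows = (rows + num_threads - 1) / num_threads;  // ceil(10 / 4) = 3
    for (int32_t i = 0; i < num_threads; i++)
    {
        int32_t start_row = i * n_rows;
        int32_t end_row = start_row + n_rows;
        if (end_row > rows) end_row = rows;
        if (start_row >= rows) break;  // added guard: skips empty chunks when threads > rows
        std::printf("thread %d: rows [%d, %d)\n", i, start_row, end_row);
    }
}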
g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, nullptr, - dst_ptr, oc_per_gp, dst_h * dst_w, ic_per_gp * kernel_h * kernel_w, 1.0f, - 1.0f, dst_ptr, start_row, end_row); - threads.emplace_back(std::move(t)); - } - // Wait for all threads to complete - for (auto &t : threads) t.join(); - threads.clear(); - } else { // parallel gemm degrade to serial gemm with start_row=0 and end_row= oc_per_gp - parallel_unroll_gemm(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns, - nullptr, dst_ptr, oc_per_gp, dst_h * dst_w, - ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr, 0, oc_per_gp); - } + + void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext* context) + { + const int64_t stride_height = stride_height_; + const int64_t stride_width = stride_width_; + const int64_t padding_height = padding_height_; + const int64_t padding_width = padding_width_; + const int64_t dilation_height = dilation_height_; + const int64_t dilation_width = dilation_width_; + const int64_t deformable_group = deformable_group_; + const int64_t group = group_; + + const OrtValue* input = ort_.KernelContext_GetInput(context, 0); + const float* input_data = reinterpret_cast(ort_.GetTensorData(input)); + + const OrtValue* offset = ort_.KernelContext_GetInput(context, 1); + const float* offset_data = reinterpret_cast(ort_.GetTensorData(offset)); + + const OrtValue* mask = ort_.KernelContext_GetInput(context, 2); + const float* mask_data = reinterpret_cast(ort_.GetTensorData(mask)); + + const OrtValue* filter = ort_.KernelContext_GetInput(context, 3); + const float* filter_data = reinterpret_cast(ort_.GetTensorData(filter)); + + const OrtValue* bias = ort_.KernelContext_GetInput(context, 4); + const float* bias_data = (bias != nullptr) ? reinterpret_cast(ort_.GetTensorData(bias)) : nullptr; + // const float *bias_data = nullptr; + + OrtTensorDimensions input_dims(ort_, input); + OrtTensorDimensions filter_dims(ort_, filter); + + int64_t batch = input_dims[0]; + int64_t channels = input_dims[1]; + int64_t in_height = input_dims[2]; + int64_t in_width = input_dims[3]; + int64_t num_output = filter_dims[0]; + int64_t kernel_height = filter_dims[2]; + int64_t kernel_width = filter_dims[3]; + + // get output memory + int64_t out_height = floor( + (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height + + 1); + int64_t out_width = floor( + (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1); + + std::vector output_dims = {batch, num_output, out_height, out_width}; + OrtValue* output = + ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size()); + float* out_ptr = ort_.GetTensorMutableData(output); + + // allocate tmp memory + int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width; + float* columns = (float*)allocator_.Alloc(sizeof(float) * column_len); + + deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch, channels, in_height, in_width, num_output, out_height, out_width, group, deformable_group, channels, num_output, kernel_height, kernel_width, stride_height, stride_width, padding_height, padding_width, dilation_height, dilation_width, columns, out_ptr); + + allocator_.Free(columns); } - } -} - -MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(const OrtApi &api, - const OrtKernelInfo *info) - : ort_(api), info_(info) { - std::vector stride = ort_.KernelInfoGetAttribute>(info, "stride"); - stride_height_ = 
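The output size in Compute is the standard dilated-convolution formula, out = floor((in + 2 * pad - dilation * (kernel - 1) - 1) / stride) + 1. Worked example: in = 64, kernel = 3, pad = 1, stride = 2, dilation = 1 gives floor(63 / 2) + 1 = 32. A one-line sanity check:

#include <cstdint>
#include <cstdio>

static int64_t conv_out(int64_t in, int64_t k, int64_t pad, int64_t stride, int64_t dil)
{
    return (in + 2 * pad - dil * (k - 1) - 1) / stride + 1;  // integer division floors for positive sizes
}

int main()
{
    std::printf("%lld\n", (long long)conv_out(64, 3, 1, 2, 1));  // 32
}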
stride[0]; - stride_width_ = stride[1]; - std::vector padding = ort_.KernelInfoGetAttribute>(info, "padding"); - padding_height_ = padding[0]; - padding_width_ = padding[1]; - std::vector dilation = - ort_.KernelInfoGetAttribute>(info, "dilation"); - dilation_height_ = dilation[0]; - dilation_width_ = dilation[1]; - deformable_group_ = ort_.KernelInfoGetAttribute(info, "deform_groups"); - group_ = ort_.KernelInfoGetAttribute(info, "groups"); - - // create allocator - allocator_ = Ort::AllocatorWithDefaultOptions(); -} - -void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) { - const int64_t stride_height = stride_height_; - const int64_t stride_width = stride_width_; - const int64_t padding_height = padding_height_; - const int64_t padding_width = padding_width_; - const int64_t dilation_height = dilation_height_; - const int64_t dilation_width = dilation_width_; - const int64_t deformable_group = deformable_group_; - const int64_t group = group_; - - const OrtValue *input = ort_.KernelContext_GetInput(context, 0); - const float *input_data = reinterpret_cast(ort_.GetTensorData(input)); - - const OrtValue *offset = ort_.KernelContext_GetInput(context, 1); - const float *offset_data = reinterpret_cast(ort_.GetTensorData(offset)); - - const OrtValue *mask = ort_.KernelContext_GetInput(context, 2); - const float *mask_data = reinterpret_cast(ort_.GetTensorData(mask)); - - const OrtValue *filter = ort_.KernelContext_GetInput(context, 3); - const float *filter_data = reinterpret_cast(ort_.GetTensorData(filter)); - - const OrtValue *bias = ort_.KernelContext_GetInput(context, 4); - const float *bias_data = (bias != nullptr) - ? reinterpret_cast(ort_.GetTensorData(bias)) - : nullptr; - // const float *bias_data = nullptr; - - OrtTensorDimensions input_dims(ort_, input); - OrtTensorDimensions filter_dims(ort_, filter); - - int64_t batch = input_dims[0]; - int64_t channels = input_dims[1]; - int64_t in_height = input_dims[2]; - int64_t in_width = input_dims[3]; - int64_t num_output = filter_dims[0]; - int64_t kernel_height = filter_dims[2]; - int64_t kernel_width = filter_dims[3]; - - // get output memory - int64_t out_height = floor( - (in_height + 2 * padding_height - dilation_height * (kernel_height - 1) - 1) / stride_height + - 1); - int64_t out_width = floor( - (in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) / stride_width + 1); - - std::vector output_dims = {batch, num_output, out_height, out_width}; - OrtValue *output = - ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size()); - float *out_ptr = ort_.GetTensorMutableData(output); - - // allocate tmp memory - int64_t column_len = (channels / group) * kernel_height * kernel_width * out_height * out_width; - float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len); - - deformable_conv2d_ref_fp32(input_data, offset_data, mask_data, filter_data, bias_data, batch, - channels, in_height, in_width, num_output, out_height, out_width, - group, deformable_group, channels, num_output, kernel_height, - kernel_width, stride_height, stride_width, padding_height, - padding_width, dilation_height, dilation_width, columns, out_ptr); - - allocator_.Free(columns); -} -REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp); -REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp); + REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVModulatedDeformConvOp); + REGISTER_ONNXRUNTIME_OPS(mmcv, MMCVModulatedDeformConvOp); } // namespace mmdeploy diff --git 
a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h index 772a9c4a88..7ffeb702d3 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/modulated_deform_conv/modulated_deform_conv.h @@ -4,55 +4,74 @@ #include -namespace mmdeploy { - -struct MMCVModulatedDeformConvKernel { - MMCVModulatedDeformConvKernel(const OrtApi &api, const OrtKernelInfo *info); - - void Compute(OrtKernelContext *context); - - protected: - Ort::CustomOpApi ort_; - const OrtKernelInfo *info_; - Ort::AllocatorWithDefaultOptions allocator_; - - int64_t stride_height_; - int64_t stride_width_; - int64_t padding_height_; - int64_t padding_width_; - int64_t dilation_height_; - int64_t dilation_width_; - int64_t deformable_group_; - int64_t group_; -}; - -struct MMCVModulatedDeformConvOp - : Ort::CustomOpBase { - void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const { - return new MMCVModulatedDeformConvKernel(api, info); - } - - const char *GetName() const { return "MMCVModulatedDeformConv2d"; }; - - size_t GetInputTypeCount() const { return 5; }; - ONNXTensorElementDataType GetInputType(size_t /*index*/) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - }; - - OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const { - // The last input (index == 4) is optional, which is bias - if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL; - - return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; - } - - size_t GetOutputTypeCount() const { return 1; }; - ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - }; - - // force cpu - const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; }; -}; +namespace mmdeploy +{ + + struct MMCVModulatedDeformConvKernel + { + MMCVModulatedDeformConvKernel(const OrtApi& api, const OrtKernelInfo* info); + + void Compute(OrtKernelContext* context); + + protected: + Ort::CustomOpApi ort_; + const OrtKernelInfo* info_; + Ort::AllocatorWithDefaultOptions allocator_; + + int64_t stride_height_; + int64_t stride_width_; + int64_t padding_height_; + int64_t padding_width_; + int64_t dilation_height_; + int64_t dilation_width_; + int64_t deformable_group_; + int64_t group_; + }; + + struct MMCVModulatedDeformConvOp + : Ort::CustomOpBase + { + void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const + { + return new MMCVModulatedDeformConvKernel(api, info); + } + + const char* GetName() const + { + return "MMCVModulatedDeformConv2d"; + }; + + size_t GetInputTypeCount() const + { + return 5; + }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; + + OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const + { + // The last input (index == 4) is optional, which is bias + if (index == 4) return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL; + + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; + } + + size_t GetOutputTypeCount() const + { + return 1; + }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + }; + + // force cpu + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + }; + }; } // namespace 
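All of these kernels follow the same Ort::CustomOpBase pattern and are wired up by the REGISTER_ONNXRUNTIME_OPS macro. For orientation, a hedged sketch of the standard ONNX Runtime C++ registration flow for such an op; the op type comes from this header, but the env name and model path are hypothetical, and in mmdeploy this wiring is generated by the macro rather than written by hand:

#include <onnxruntime_cxx_api.h>

int main()
{
    static MMCVModulatedDeformConvOp deform_conv_op;   // op instance must outlive the session
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "demo");   // hypothetical env name
    Ort::SessionOptions options;
    Ort::CustomOpDomain domain("mmdeploy");            // domain the op is registered under
    domain.Add(&deform_conv_op);                       // exposes "MMCVModulatedDeformConv2d"
    options.Add(domain);                               // keep domain alive while the session exists
    Ort::Session session(env, "model.onnx", options);  // hypothetical model path
}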
mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp index 784be2c987..397bcbf92c 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.cpp @@ -13,117 +13,132 @@ #include "ort_utils.h" -namespace mmdeploy { -struct Box { - float x1, y1, x2, y2; -}; - -float nms_match_iou(Box box1, Box box2) { - auto inter_x1 = std::max(box1.x1, box2.x1); - auto inter_y1 = std::max(box1.y1, box2.y1); - auto inter_x2 = std::min(box1.x2, box2.x2); - auto inter_y2 = std::min(box1.y2, box2.y2); - - auto eps = 1e-10; - - auto w = std::max(static_cast(0), inter_x2 - inter_x1); - auto h = std::max(static_cast(0), inter_y2 - inter_y1); - - auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1); - auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1); - auto inter = w * h; - auto ovr = inter / (area1 + area2 - inter + eps); - return ovr; -} -NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info) - : ort_(api), info_(info) { - // create allocator - allocator_ = Ort::AllocatorWithDefaultOptions(); -} - -void NMSMatchKernel::Compute(OrtKernelContext* context) { - const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); - const float* boxes_data = reinterpret_cast(ort_.GetTensorData(boxes)); - const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); - const float* scores_data = reinterpret_cast(ort_.GetTensorData(scores)); - const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2); - const float iou_threshold_data = ort_.GetTensorData(iou_threshold_)[0]; - const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3); - const float score_threshold_data = ort_.GetTensorData(score_threshold_)[0]; - - OrtTensorDimensions boxes_dim(ort_, boxes); - OrtTensorDimensions scores_dim(ort_, scores); - // loop over batch - int64_t nbatch = boxes_dim[0]; - int64_t nboxes = boxes_dim[1]; - int64_t nclass = scores_dim[1]; - assert(boxes_dim[2] == 4); //(x1, x2, y1, y2) - // alloc some temp memory - bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes); - - std::vector res_order; - for (int64_t k = 0; k < nbatch; k++) { - for (int64_t g = 0; g < nclass; g++) { - for (int64_t i = 0; i < nboxes; i++) { - select[i] = true; - } - // scores - // k * nboxes * nclass means per batch - // g * nboxes means per class - // batch = 2 boxes = 3 classes = 4 - std::vector tmp_sc; - // get the class scores - for (int i = 0; i < nboxes; i++) { - tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]); - } - - std::vector order(tmp_sc.size()); - std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), - [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; }); - for (int64_t _i = 0; _i < nboxes; _i++) { - auto i = order[_i]; - if (select[i] == false) continue; - std::vector v_i; - for (int64_t _j = _i + 1; _j < nboxes; _j++) { - auto j = order[_j]; - if (select[j] == false) continue; - Box vbox1, vbox2; - vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4]; - vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1]; - vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2]; - vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 3]; - - vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4]; - vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1]; - vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2]; - vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3]; - - auto ovr = 
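nms_match_iou is plain intersection-over-union with an epsilon guard against degenerate boxes. Worked example: boxes (0, 0, 2, 2) and (1, 1, 3, 3) overlap in a 1 x 1 square, so IoU = 1 / (4 + 4 - 1) ≈ 0.143. A standalone check (epsilon omitted):

#include <algorithm>
#include <cstdio>

static float iou(float ax1, float ay1, float ax2, float ay2,
                 float bx1, float by1, float bx2, float by2)
{
    float w = std::max(0.f, std::min(ax2, bx2) - std::max(ax1, bx1));
    float h = std::max(0.f, std::min(ay2, by2) - std::max(ay1, by1));
    float inter = w * h;
    float area_a = (ax2 - ax1) * (ay2 - ay1);
    float area_b = (bx2 - bx1) * (by2 - by1);
    return inter / (area_a + area_b - inter);
}

int main()
{
    std::printf("%.3f\n", iou(0, 0, 2, 2, 1, 1, 3, 3));  // 0.143 = 1 / 7
}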
nms_match_iou(vbox1, vbox2); - if (ovr >= iou_threshold_data) { - select[j] = false; - v_i.push_back(j); - } - } - if (tmp_sc[i] > score_threshold_data && v_i.size() != 0) { - for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++) { - res_order.push_back(k); - res_order.push_back(g); - res_order.push_back(i); - res_order.push_back(v_i[v_i_idx]); - } - } - } +namespace mmdeploy +{ + struct Box + { + float x1, y1, x2, y2; + }; + + float nms_match_iou(Box box1, Box box2) + { + auto inter_x1 = std::max(box1.x1, box2.x1); + auto inter_y1 = std::max(box1.y1, box2.y1); + auto inter_x2 = std::min(box1.x2, box2.x2); + auto inter_y2 = std::min(box1.y2, box2.y2); + + auto eps = 1e-10; + + auto w = std::max(static_cast(0), inter_x2 - inter_x1); + auto h = std::max(static_cast(0), inter_y2 - inter_y1); + + auto area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1); + auto area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1); + auto inter = w * h; + auto ovr = inter / (area1 + area2 - inter + eps); + return ovr; + } + NMSMatchKernel::NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info) + : ort_(api) + , info_(info) + { + // create allocator + allocator_ = Ort::AllocatorWithDefaultOptions(); } - } - std::vector inds_dims({(int64_t)res_order.size() / 4, 4}); - OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size()); - int64_t* res_data = ort_.GetTensorMutableData(res); + void NMSMatchKernel::Compute(OrtKernelContext* context) + { + const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); + const float* boxes_data = reinterpret_cast(ort_.GetTensorData(boxes)); + const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); + const float* scores_data = reinterpret_cast(ort_.GetTensorData(scores)); + const OrtValue* iou_threshold_ = ort_.KernelContext_GetInput(context, 2); + const float iou_threshold_data = ort_.GetTensorData(iou_threshold_)[0]; + const OrtValue* score_threshold_ = ort_.KernelContext_GetInput(context, 3); + const float score_threshold_data = ort_.GetTensorData(score_threshold_)[0]; + + OrtTensorDimensions boxes_dim(ort_, boxes); + OrtTensorDimensions scores_dim(ort_, scores); + // loop over batch + int64_t nbatch = boxes_dim[0]; + int64_t nboxes = boxes_dim[1]; + int64_t nclass = scores_dim[1]; + assert(boxes_dim[2] == 4); //(x1, x2, y1, y2) + // alloc some temp memory + bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes); + + std::vector res_order; + for (int64_t k = 0; k < nbatch; k++) + { + for (int64_t g = 0; g < nclass; g++) + { + for (int64_t i = 0; i < nboxes; i++) + { + select[i] = true; + } + // scores + // k * nboxes * nclass means per batch + // g * nboxes means per class + // batch = 2 boxes = 3 classes = 4 + std::vector tmp_sc; + // get the class scores + for (int i = 0; i < nboxes; i++) + { + tmp_sc.push_back(scores_data[k * nboxes * nclass + g * nboxes + i]); + } + + std::vector order(tmp_sc.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) + { return tmp_sc[id1] > tmp_sc[id2]; }); + for (int64_t _i = 0; _i < nboxes; _i++) + { + auto i = order[_i]; + if (select[i] == false) continue; + std::vector v_i; + for (int64_t _j = _i + 1; _j < nboxes; _j++) + { + auto j = order[_j]; + if (select[j] == false) continue; + Box vbox1, vbox2; + vbox1.x1 = boxes_data[k * nboxes * 4 + i * 4]; + vbox1.y1 = boxes_data[k * nboxes * 4 + i * 4 + 1]; + vbox1.x2 = boxes_data[k * nboxes * 4 + i * 4 + 2]; + vbox1.y2 = boxes_data[k * nboxes * 4 + i * 4 + 
3]; + + vbox2.x1 = boxes_data[k * nboxes * 4 + j * 4]; + vbox2.y1 = boxes_data[k * nboxes * 4 + j * 4 + 1]; + vbox2.x2 = boxes_data[k * nboxes * 4 + j * 4 + 2]; + vbox2.y2 = boxes_data[k * nboxes * 4 + j * 4 + 3]; + + auto ovr = nms_match_iou(vbox1, vbox2); + if (ovr >= iou_threshold_data) + { + select[j] = false; + v_i.push_back(j); + } + } + if (tmp_sc[i] > score_threshold_data && v_i.size() != 0) + { + for (int v_i_idx = 0; v_i_idx < v_i.size(); v_i_idx++) + { + res_order.push_back(k); + res_order.push_back(g); + res_order.push_back(i); + res_order.push_back(v_i[v_i_idx]); + } + } + } + } + } + std::vector inds_dims({(int64_t)res_order.size() / 4, 4}); + + OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size()); + int64_t* res_data = ort_.GetTensorMutableData(res); - memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size()); + memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size()); - allocator_.Free(select); -} -REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp); + allocator_.Free(select); + } + REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSMatchOp); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h index 57aa94d964..48e0d0dbb0 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_match/nms_match.h @@ -10,37 +10,55 @@ #include #include -namespace mmdeploy { -struct NMSMatchKernel { - NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info); - - void Compute(OrtKernelContext* context); - - private: - Ort::CustomOpApi ort_; - const OrtKernelInfo* info_; - Ort::AllocatorWithDefaultOptions allocator_; -}; - -struct NMSMatchOp : Ort::CustomOpBase { - void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { - return new NMSMatchKernel(api, info); - } - const char* GetName() const { return "NMSMatch"; } - - size_t GetInputTypeCount() const { return 4; } - ONNXTensorElementDataType GetInputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - size_t GetOutputTypeCount() const { return 1; } - ONNXTensorElementDataType GetOutputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; - } - - // force cpu - const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; } -}; +namespace mmdeploy +{ + struct NMSMatchKernel + { + NMSMatchKernel(const OrtApi& api, const OrtKernelInfo* info); + + void Compute(OrtKernelContext* context); + + private: + Ort::CustomOpApi ort_; + const OrtKernelInfo* info_; + Ort::AllocatorWithDefaultOptions allocator_; + }; + + struct NMSMatchOp : Ort::CustomOpBase + { + void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const + { + return new NMSMatchKernel(api, info); + } + const char* GetName() const + { + return "NMSMatch"; + } + + size_t GetInputTypeCount() const + { + return 4; + } + ONNXTensorElementDataType GetInputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + size_t GetOutputTypeCount() const + { + return 1; + } + ONNXTensorElementDataType GetOutputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + } + + // force cpu + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + } + }; } // namespace mmdeploy #endif // ONNXRUNTIME_NMS_MATCH_H diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp 
index 9d8cc4597e..73c508ce47 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.cpp @@ -13,356 +13,418 @@ #include "ort_utils.h" -namespace mmdeploy { - -namespace { -struct RotatedBox { - float x_ctr, y_ctr, w, h, a; -}; -struct Point { - float x, y; - Point(const float& px = 0, const float& py = 0) : x(px), y(py) {} - Point operator+(const Point& p) const { return Point(x + p.x, y + p.y); } - Point& operator+=(const Point& p) { - x += p.x; - y += p.y; - return *this; - } - Point operator-(const Point& p) const { return Point(x - p.x, y - p.y); } - Point operator*(const float coeff) const { return Point(x * coeff, y * coeff); } -}; - -float dot_2d(const Point& A, const Point& B) { return A.x * B.x + A.y * B.y; } - -float cross_2d(const Point& A, const Point& B) { return A.x * B.y - B.x * A.y; } -} // namespace - -void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) { - // M_PI / 180. == 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - float cosTheta2 = (float)cos(theta) * 0.5f; - float sinTheta2 = (float)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; -} - -int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - float det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - float t1 = cross_2d(vec2[j], vec12) / det; - float t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } - } - } - - // Check for vertices of rect1 inside rect2 - { - const auto& AB = vec2[0]; - const auto& DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. 
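get_rotated_vertices expands a (x_ctr, y_ctr, w, h, a) box into four corners; per the MODIFIED comment, the angle is consumed in radians rather than degrees. Sanity check at a = 0 for the box (2, 3, 4, 2): the corners come out as (0, 4), (0, 2), (4, 2), (4, 4), the axis-aligned rectangle, with pts[2] and pts[3] produced by reflecting pts[0] and pts[1] through the center. A standalone sketch:

#include <cmath>
#include <cstdio>

int main()
{
    float cx = 2, cy = 3, w = 4, h = 2, a = 0;  // angle in radians
    float c2 = std::cos(a) * 0.5f, s2 = std::sin(a) * 0.5f;
    float px[4], py[4];
    px[0] = cx - s2 * h - c2 * w; py[0] = cy + c2 * h - s2 * w;
    px[1] = cx + s2 * h - c2 * w; py[1] = cy - c2 * h - s2 * w;
    px[2] = 2 * cx - px[0];       py[2] = 2 * cy - py[0];  // point reflection of pts[0]
    px[3] = 2 * cx - px[1];       py[3] = 2 * cy - py[1];  // point reflection of pts[1]
    for (int i = 0; i < 4; ++i) std::printf("(%g, %g)\n", px[i], py[i]);
}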
P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } - } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto& AB = vec1[0]; - const auto& DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } +namespace mmdeploy +{ + + namespace + { + struct RotatedBox + { + float x_ctr, y_ctr, w, h, a; + }; + struct Point + { + float x, y; + Point(const float& px = 0, const float& py = 0) + : x(px) + , y(py) + { + } + Point operator+(const Point& p) const + { + return Point(x + p.x, y + p.y); + } + Point& operator+=(const Point& p) + { + x += p.x; + y += p.y; + return *this; + } + Point operator-(const Point& p) const + { + return Point(x - p.x, y - p.y); + } + Point operator*(const float coeff) const + { + return Point(x * coeff, y * coeff); + } + }; + + float dot_2d(const Point& A, const Point& B) + { + return A.x * B.x + A.y * B.y; + } + + float cross_2d(const Point& A, const Point& B) + { + return A.x * B.y - B.x * A.y; + } + } // namespace + + void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) + { + // M_PI / 180. == 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + float cosTheta2 = (float)cos(theta) * 0.5f; + float sinTheta2 = (float)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; } - } - - return num; -} - -int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. 
- int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; + + int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) + { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) + { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + // Solve for 2x2 Ax=b + float det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) + { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + float t1 = cross_2d(vec2[j], vec12) / det; + float t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) + { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts2[i]; + } + } + } + + return num; } - } - auto& start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - float dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - // CPU version - std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { + + int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], bool shift_to_zero = false) + { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
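The angular sort in convex_hull_graham relies on the sign of cross_2d: cross_2d(A, B) > 0 means the turn from A to B is counter-clockwise, so A has the smaller polar angle around the pivot and sorts first; near-zero cross products fall back to distance from the pivot. A minimal illustration:

#include <cstdio>

struct P { float x, y; };
static float cross_2d(const P& A, const P& B) { return A.x * B.y - B.x * A.y; }

int main()
{
    P A{1, 0}, B{0, 1};  // A at 0 degrees, B at 90 degrees
    std::printf("%g\n", cross_2d(A, B));  // 1: positive, so A precedes B in the sort
    std::printf("%g\n", cross_2d(B, A));  // -1: order reversed
}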
+ int t = 0; + for (int i = 1; i < num_in; i++) + { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) + { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) + { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + float dist[24]; + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); + } + + // CPU version + std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool + { float temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; + } }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); + } + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) + { + if (dist[k] > 1e-8) + { + break; + } + } + if (k == num_in) + { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) + { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) + { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) + { + for (int i = 0; i < m; i++) + { + q[i] += start; + } + } + + return m; } - }); - // compute distance to origin after sort, since the points are now different. - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; - } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. 
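polygon_area below fans triangles out of q[0] and sums |cross| / 2 per triangle, which is exact for the convex, angularly ordered points that convex_hull_graham produces. Worked example on the unit square: two triangles of area 1/2 each, total 1. A standalone check:

#include <cmath>
#include <cstdio>

struct P { float x, y; };
static float cross_2d(const P& A, const P& B) { return A.x * B.y - B.x * A.y; }

int main()
{
    P q[4] = {{0, 0}, {1, 0}, {1, 1}, {0, 1}};  // convex, ordered
    float area = 0;
    for (int i = 1; i < 3; ++i)
    {
        P u{q[i].x - q[0].x, q[i].y - q[0].y};
        P v{q[i + 1].x - q[0].x, q[i + 1].y - q[0].y};
        area += std::fabs(cross_2d(u, v));
    }
    std::printf("%g\n", area / 2);  // 1: the unit square
}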
- // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; - } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. - if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; - } - } - - return m; -} - -float polygon_area(const Point (&q)[24], const int& m) { - if (m <= 2) { - return 0; - } - - float area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } - - return area / 2.0; -} - -float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; - - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); - - int num = get_intersection_points(pts1, pts2, intersectPts); - - if (num <= 2) { - return 0.0; - } - - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. - int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); -} - -NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info) - : ort_(api), info_(info) { - iou_threshold_ = ort_.KernelInfoGetAttribute(info, "iou_threshold"); - score_threshold_ = ort_.KernelInfoGetAttribute(info, "score_threshold"); - - // create allocator - allocator_ = Ort::AllocatorWithDefaultOptions(); -} - -void NMSRotatedKernel::Compute(OrtKernelContext* context) { - const float iou_threshold = iou_threshold_; - const float score_threshold = score_threshold_; - - const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0); - const float* boxes_data = reinterpret_cast(ort_.GetTensorData(boxes)); - const OrtValue* scores = ort_.KernelContext_GetInput(context, 1); - const float* scores_data = reinterpret_cast(ort_.GetTensorData(scores)); - - OrtTensorDimensions boxes_dim(ort_, boxes); - OrtTensorDimensions scores_dim(ort_, scores); - - // loop over batch - int64_t nbatch = boxes_dim[0]; - int64_t nboxes = boxes_dim[1]; - int64_t nclass = scores_dim[1]; - assert(boxes_dim[2] == 5); //(cx,cy,w,h,theta) - - // allocate tmp memory - float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5); - float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes); - bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes); - - memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5); - memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes); - - // std::vector> res_order; - std::vector res_order; - for (int64_t k = 0; k < nbatch; k++) { - for (int64_t g = 0; g < nclass; g++) { - for (int64_t i = 0; i < nboxes; i++) { - select[i] = true; - } - // sort scores - std::vector tmp_sc; - for (int i = 0; i < nboxes; i++) { - tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]); - } - std::vector order(tmp_sc.size()); - std::iota(order.begin(), order.end(), 
0); - std::sort(order.begin(), order.end(), - [&tmp_sc](int64_t id1, int64_t id2) { return tmp_sc[id1] > tmp_sc[id2]; }); - for (int64_t _i = 0; _i < nboxes; _i++) { - if (select[_i] == false) continue; - auto i = order[_i]; - for (int64_t _j = _i + 1; _j < nboxes; _j++) { - if (select[_j] == false) continue; - auto j = order[_j]; - RotatedBox box1, box2; - auto center_shift_x = - (tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0; - auto center_shift_y = - (tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0; - box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x; - box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y; - box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2]; - box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3]; - box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4]; - box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x; - box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y; - box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2]; - box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3]; - box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4]; - auto area1 = box1.w * box1.h; - auto area2 = box2.w * box2.h; - auto intersection = rotated_boxes_intersection(box1, box2); - float baseS = 1.0; - baseS = (area1 + area2 - intersection); - auto ovr = intersection / baseS; - if (ovr > iou_threshold) select[_j] = false; + + float polygon_area(const Point (&q)[24], const int& m) + { + if (m <= 2) + { + return 0; } - } - for (int i = 0; i < nboxes; i++) { - if (select[i] & (tmp_sc[order[i]] > score_threshold)) { - res_order.push_back(k); - res_order.push_back(g); - res_order.push_back(order[i]); + + float area = 0; + for (int i = 1; i < m - 1; i++) + { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } - } - } // class loop - } // batch loop - std::vector inds_dims({(int64_t)res_order.size() / 3, 3}); + return area / 2.0; + } + + float rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) + { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; - OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size()); - int64_t* res_data = ort_.GetTensorMutableData(res); + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); - memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size()); + int num = get_intersection_points(pts1, pts2, intersectPts); - allocator_.Free(tmp_boxes); - allocator_.Free(sc); - allocator_.Free(select); -} + if (num <= 2) + { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
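
polygon_area above is the triangle-fan form of the shoelace formula: with the hull ordered around q[0], the triangles (q[0], q[i], q[i+1]) tile the convex polygon, and each |cross_2d| is twice one triangle's area. A quick standalone check on a unit square (the array contents are illustrative) before the hull call that follows:

    #include <cmath>
    #include <cstdio>

    struct Pt { float x, y; };
    static float cross2(Pt a, Pt b) { return a.x * b.y - a.y * b.x; }

    int main() {
        // Hull vertices in order, as convex_hull_graham(..., true) leaves them
        // (shifted so the pivot q[0] sits at the origin).
        Pt q[4] = {{0, 0}, {1, 0}, {1, 1}, {0, 1}};
        float area = 0;
        for (int i = 1; i < 4 - 1; i++) {
            Pt a{q[i].x - q[0].x, q[i].y - q[0].y};
            Pt b{q[i + 1].x - q[0].x, q[i + 1].y - q[0].y};
            area += std::fabs(cross2(a, b));  // twice the triangle (q0, qi, qi+1)
        }
        std::printf("%g\n", area / 2.0f);  // prints 1
        return 0;
    }
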
+        int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
+        return polygon_area(orderedPts, num_convex);
+    }
+
+    NMSRotatedKernel::NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info)
+        : ort_(api)
+        , info_(info)
+    {
+        iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
+        score_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "score_threshold");
+
+        // create allocator
+        allocator_ = Ort::AllocatorWithDefaultOptions();
+    }
+
+    void NMSRotatedKernel::Compute(OrtKernelContext* context)
+    {
+        const float iou_threshold = iou_threshold_;
+        const float score_threshold = score_threshold_;
+
+        const OrtValue* boxes = ort_.KernelContext_GetInput(context, 0);
+        const float* boxes_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(boxes));
+        const OrtValue* scores = ort_.KernelContext_GetInput(context, 1);
+        const float* scores_data = reinterpret_cast<const float*>(ort_.GetTensorData<float>(scores));
+
+        OrtTensorDimensions boxes_dim(ort_, boxes);
+        OrtTensorDimensions scores_dim(ort_, scores);
+
+        // loop over batch
+        int64_t nbatch = boxes_dim[0];
+        int64_t nboxes = boxes_dim[1];
+        int64_t nclass = scores_dim[1];
+        assert(boxes_dim[2] == 5);  //(cx,cy,w,h,theta)
+
+        // allocate tmp memory
+        float* tmp_boxes = (float*)allocator_.Alloc(sizeof(float) * nbatch * nboxes * 5);
+        float* sc = (float*)allocator_.Alloc(sizeof(float) * nbatch * nclass * nboxes);
+        bool* select = (bool*)allocator_.Alloc(sizeof(bool) * nbatch * nboxes);
+
+        memcpy(tmp_boxes, boxes_data, sizeof(float) * nbatch * nboxes * 5);
+        memcpy(sc, scores_data, sizeof(float) * nbatch * nclass * nboxes);
+
+        // std::vector> res_order;
+        std::vector<int64_t> res_order;
+        for (int64_t k = 0; k < nbatch; k++)
+        {
+            for (int64_t g = 0; g < nclass; g++)
+            {
+                for (int64_t i = 0; i < nboxes; i++)
+                {
+                    select[i] = true;
+                }
+                // sort scores
+                std::vector<float> tmp_sc;
+                for (int i = 0; i < nboxes; i++)
+                {
+                    tmp_sc.push_back(sc[k * nboxes * nclass + g * nboxes + i]);
+                }
+                std::vector<int64_t> order(tmp_sc.size());
+                std::iota(order.begin(), order.end(), 0);
+                std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2)
+                          { return tmp_sc[id1] > tmp_sc[id2]; });
+                for (int64_t _i = 0; _i < nboxes; _i++)
+                {
+                    if (select[_i] == false) continue;
+                    auto i = order[_i];
+                    for (int64_t _j = _i + 1; _j < nboxes; _j++)
+                    {
+                        if (select[_j] == false) continue;
+                        auto j = order[_j];
+                        RotatedBox box1, box2;
+                        auto center_shift_x =
+                            (tmp_boxes[k * nboxes * 5 + i * 5] + tmp_boxes[k * nboxes * 5 + j * 5]) / 2.0;
+                        auto center_shift_y =
+                            (tmp_boxes[k * nboxes * 5 + i * 5 + 1] + tmp_boxes[k * nboxes * 5 + j * 5 + 1]) / 2.0;
+                        box1.x_ctr = tmp_boxes[k * nboxes * 5 + i * 5] - center_shift_x;
+                        box1.y_ctr = tmp_boxes[k * nboxes * 5 + i * 5 + 1] - center_shift_y;
+                        box1.w = tmp_boxes[k * nboxes * 5 + i * 5 + 2];
+                        box1.h = tmp_boxes[k * nboxes * 5 + i * 5 + 3];
+                        box1.a = tmp_boxes[k * nboxes * 5 + i * 5 + 4];
+                        box2.x_ctr = tmp_boxes[k * nboxes * 5 + j * 5] - center_shift_x;
+                        box2.y_ctr = tmp_boxes[k * nboxes * 5 + j * 5 + 1] - center_shift_y;
+                        box2.w = tmp_boxes[k * nboxes * 5 + j * 5 + 2];
+                        box2.h = tmp_boxes[k * nboxes * 5 + j * 5 + 3];
+                        box2.a = tmp_boxes[k * nboxes * 5 + j * 5 + 4];
+                        auto area1 = box1.w * box1.h;
+                        auto area2 = box2.w * box2.h;
+                        auto intersection = rotated_boxes_intersection(box1, box2);
+                        float baseS = 1.0;
+                        baseS = (area1 + area2 - intersection);
+                        auto ovr = intersection / baseS;
+                        if (ovr > iou_threshold) select[_j] = false;
+                    }
+                }
+                for (int i = 0; i < nboxes; i++)
+                {
+                    if (select[i] & (tmp_sc[order[i]] > score_threshold))
+                    {
+                        res_order.push_back(k);
+                        res_order.push_back(g);
+                        res_order.push_back(order[i]);
+                    }
+                }
+            }  // class loop
+        }      // batch loop
+
+        std::vector<int64_t> inds_dims({(int64_t)res_order.size() / 3, 3});
+
+        OrtValue* res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(), inds_dims.size());
+        int64_t* res_data = ort_.GetTensorMutableData<int64_t>(res);
+
+        memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
+
+        allocator_.Free(tmp_boxes);
+        allocator_.Free(sc);
+        allocator_.Free(select);
+    }

-REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp);
+    REGISTER_ONNXRUNTIME_OPS(mmdeploy, NMSRotatedOp);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
index 6ed44ce410..3b4aa856a5 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/nms_rotated/nms_rotated.h
@@ -10,39 +10,57 @@
 #include
 #include
-namespace mmdeploy {
-struct NMSRotatedKernel {
-  NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info);
-
-  void Compute(OrtKernelContext* context);
-
- private:
-  Ort::CustomOpApi ort_;
-  const OrtKernelInfo* info_;
-  Ort::AllocatorWithDefaultOptions allocator_;
-  float iou_threshold_;
-  float score_threshold_;
-};
-
-struct NMSRotatedOp : Ort::CustomOpBase<NMSRotatedOp, NMSRotatedKernel> {
-  void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
-    return new NMSRotatedKernel(api, info);
-  }
-  const char* GetName() const { return "NMSRotated"; }
-
-  size_t GetInputTypeCount() const { return 2; }
-  ONNXTensorElementDataType GetInputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-  }
-
-  size_t GetOutputTypeCount() const { return 1; }
-  ONNXTensorElementDataType GetOutputType(size_t) const {
-    return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
-  }
-
-  // force cpu
-  const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
-};
+namespace mmdeploy
+{
+    struct NMSRotatedKernel
+    {
+        NMSRotatedKernel(const OrtApi& api, const OrtKernelInfo* info);
+
+        void Compute(OrtKernelContext* context);
+
+      private:
+        Ort::CustomOpApi ort_;
+        const OrtKernelInfo* info_;
+        Ort::AllocatorWithDefaultOptions allocator_;
+        float iou_threshold_;
+        float score_threshold_;
+    };
+
+    struct NMSRotatedOp : Ort::CustomOpBase<NMSRotatedOp, NMSRotatedKernel>
+    {
+        void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const
+        {
+            return new NMSRotatedKernel(api, info);
+        }
+        const char* GetName() const
+        {
+            return "NMSRotated";
+        }
+
+        size_t GetInputTypeCount() const
+        {
+            return 2;
+        }
+        ONNXTensorElementDataType GetInputType(size_t) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+        }
+
+        size_t GetOutputTypeCount() const
+        {
+            return 1;
+        }
+        ONNXTensorElementDataType GetOutputType(size_t) const
+        {
+            return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
+        }
+
+        // force cpu
+        const char* GetExecutionProviderType() const
+        {
+            return "CPUExecutionProvider";
+        }
+    };
 }  // namespace mmdeploy
 #endif  // ONNXRUNTIME_NMS_ROTATED_H
diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
index f7b9cedff8..1159496843 100644
--- a/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
+++ b/csrc/mmdeploy/backend_ops/onnxruntime/onnxruntime_register.cpp
@@ -3,25 +3,30 @@
 #include "ort_utils.h"
-const char *c_MMDeployOpDomain = "mmdeploy";
+const char* c_MMDeployOpDomain = "mmdeploy";
-OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
const OrtApiBase *api) { - const OrtApi *kOrtApi = api->GetApi(ORT_API_VERSION); - OrtStatus *status = nullptr; - for (auto &_op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) { - OrtCustomOpDomain *domain = nullptr; - if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) { - return status; +OrtStatus* ORT_API_CALL RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) +{ + const OrtApi* kOrtApi = api->GetApi(ORT_API_VERSION); + OrtStatus* status = nullptr; + for (auto& _op_list_pair : mmdeploy::get_mmdeploy_custom_ops()) + { + OrtCustomOpDomain* domain = nullptr; + if (auto status = kOrtApi->CreateCustomOpDomain(_op_list_pair.first.c_str(), &domain)) + { + return status; + } + auto& _op_list = _op_list_pair.second; + for (auto& _op : _op_list) + { + if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) + { + return status; + } + } + // TODO: figure out what will return if failed. + status = kOrtApi->AddCustomOpDomain(options, domain); } - auto &_op_list = _op_list_pair.second; - for (auto &_op : _op_list) { - if (auto status = kOrtApi->CustomOpDomain_Add(domain, _op)) { - return status; - } - } - // TODO: figure out what will return if failed. - status = kOrtApi->AddCustomOpDomain(options, domain); - } - return status; + return status; } diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp index a8e7023fe1..4fbf6365d0 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp +++ b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.cpp @@ -5,233 +5,245 @@ #include "ort_utils.h" -namespace mmdeploy { -// implementation taken from Caffe2 -struct PreCalc { - int pos1; - int pos2; - int pos3; - int pos4; - float w1; - float w2; - float w3; - float w4; -}; - -void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height, - const int pooled_width, const int iy_upper, - const int ix_upper, float roi_start_h, float roi_start_w, - float bin_size_h, float bin_size_w, int roi_bin_grid_h, - int roi_bin_grid_w, float roi_center_h, float roi_center_w, - float cos_theta, float sin_theta, - std::vector &pre_calc) { - int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { - const float yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { - const float xx = - roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - - // Rotate by theta around the center and translate - // In image space, (y, x) is the order for Right Handed System, - // and this is essentially multiplying the point by a rotation matrix - // to rotate it counterclockwise through angle theta. 
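
The rotation in the loop above is an ordinary 2D rotation matrix applied in (y, x) order and followed by a translation to the RoI center; samples that land outside the feature map are given all-zero weights rather than being clamped away silently. A standalone sanity check with theta near pi/2 (all constants made up for the example):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float theta = 1.5707964f;                // ~pi/2
        float yy = 1.f, xx = 0.f;                      // bin-local sample offset
        float roi_center_h = 5.f, roi_center_w = 5.f;  // RoI center, feature-map scale
        float cos_theta = std::cos(theta), sin_theta = std::sin(theta);
        float y = yy * cos_theta - xx * sin_theta + roi_center_h;
        float x = yy * sin_theta + xx * cos_theta + roi_center_w;
        std::printf("(%g, %g)\n", y, x);  // ~(5, 6): the offset (1, 0) rotates onto (0, 1)
        return 0;
    }
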
- float y = yy * cos_theta - xx * sin_theta + roi_center_h; - float x = yy * sin_theta + xx * cos_theta + roi_center_w; - // deal with: inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) { - // empty - PreCalc pc; - pc.pos1 = 0; - pc.pos2 = 0; - pc.pos3 = 0; - pc.pos4 = 0; - pc.w1 = 0; - pc.w2 = 0; - pc.w3 = 0; - pc.w4 = 0; - pre_calc[pre_calc_index] = pc; - pre_calc_index += 1; - continue; - } - - if (y < 0) { - y = 0; - } - if (x < 0) { - x = 0; - } - - int y_low = (int)y; - int x_low = (int)x; - int y_high; - int x_high; - - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = (float)y_low; - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = (float)x_low; - } else { - x_high = x_low + 1; - } - - float ly = y - y_low; - float lx = x - x_low; - float hy = 1. - ly, hx = 1. - lx; - float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - // save weights and indices - PreCalc pc; - pc.pos1 = y_low * width + x_low; - pc.pos2 = y_low * width + x_high; - pc.pos3 = y_high * width + x_low; - pc.pos4 = y_high * width + x_high; - pc.w1 = w1; - pc.w2 = w2; - pc.w3 = w3; - pc.w4 = w4; - pre_calc[pre_calc_index] = pc; - - pre_calc_index += 1; +namespace mmdeploy +{ + // implementation taken from Caffe2 + struct PreCalc + { + int pos1; + int pos2; + int pos3; + int pos4; + float w1; + float w2; + float w3; + float w4; + }; + + void pre_calc_for_bilinear_interpolate(const int height, const int width, const int pooled_height, const int pooled_width, const int iy_upper, const int ix_upper, float roi_start_h, float roi_start_w, float bin_size_h, float bin_size_w, int roi_bin_grid_h, int roi_bin_grid_w, float roi_center_h, float roi_center_w, float cos_theta, float sin_theta, std::vector& pre_calc) + { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) + { + for (int pw = 0; pw < pooled_width; pw++) + { + for (int iy = 0; iy < iy_upper; iy++) + { + const float yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) + { + const float xx = + roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + float y = yy * cos_theta - xx * sin_theta + roi_center_h; + float x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) + { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) + { + y = 0; + } + if (x < 0) + { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) + { + y_high = y_low = height - 1; + y = (float)y_low; + } + else + { + y_high = y_low + 1; + } + + if (x_low >= width - 1) + { + x_high = x_low = width - 1; + x = (float)x_low; + } + else + { + x_high = x_low + 1; + } + + float ly = y - y_low; + float lx = x - x_low; + float hy = 1. - ly, hx = 1. 
- lx; + float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } } - } - } - } -} - -void ROIAlignRotatedForwardCPU(const int nthreads, const float *input, const float *rois, - float *output, const float &spatial_scale, const int aligned, - const int clockwise, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio) { - int n_rois = nthreads / channels / pooled_width / pooled_height; - // (n, c, ph, pw) is an element in the pooled output - // can be parallelized using omp - // #pragma omp parallel for num_threads(32) - for (int n = 0; n < n_rois; n++) { - int index_n = n * channels * pooled_width * pooled_height; - - const float *current_roi = rois + n * 6; - int roi_batch_ind = current_roi[0]; - - // Do not use rounding; this implementation detail is critical - float offset = aligned ? (float)0.5 : (float)0.0; - float roi_center_w = current_roi[1] * spatial_scale - offset; - float roi_center_h = current_roi[2] * spatial_scale - offset; - float roi_width = current_roi[3] * spatial_scale; - float roi_height = current_roi[4] * spatial_scale; - // float theta = current_roi[5] * M_PI / 180.0; - float theta = current_roi[5]; // Radian angle by default - if (clockwise) { - theta = -theta; } - float cos_theta = cos(theta); - float sin_theta = sin(theta); - if (!aligned) { // for backward-compatibility only - roi_width = std::max(roi_width, (float)1.); - roi_height = std::max(roi_height, (float)1.); - } - - float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 - int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - - // We do average (integral) pooling inside a bin - const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 - - // we want to precalculate indices and weights shared by all channels, - // this is the key point of optimization - std::vector pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); - - // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). - // Appropriate translation needs to be applied after. 
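
Each PreCalc entry above stores the four corner indices of the cell containing a sample plus the standard bilinear coefficients; because the weights are products of complementary fractions they always sum to 1, so the interpolation is a convex combination of the corner values. A minimal check, independent of the code above:

    #include <cstdio>

    int main() {
        float y = 2.25f, x = 3.75f;            // sample position inside the map
        int y_low = (int)y, x_low = (int)x;    // containing cell corner (2, 3)
        float ly = y - y_low, lx = x - x_low;  // 0.25, 0.75
        float hy = 1.f - ly, hx = 1.f - lx;    // 0.75, 0.25
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        std::printf("%g %g %g %g sum=%g\n", w1, w2, w3, w4, w1 + w2 + w3 + w4);
        // 0.1875 0.5625 0.0625 0.1875 sum=1
        return 0;
    }
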
- float roi_start_h = -roi_height / 2.0; - float roi_start_w = -roi_width / 2.0; - pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h, - roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, - bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h, - roi_center_w, cos_theta, sin_theta, pre_calc); - - for (int c = 0; c < channels; c++) { - int index_n_c = index_n + c * pooled_width * pooled_height; - const float *offset_input = input + (roi_batch_ind * channels + c) * height * width; - int pre_calc_index = 0; + void ROIAlignRotatedForwardCPU(const int nthreads, const float* input, const float* rois, float* output, const float& spatial_scale, const int aligned, const int clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int sampling_ratio) + { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) + { + int index_n = n * channels * pooled_width * pooled_height; + + const float* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + float offset = aligned ? (float)0.5 : (float)0.0; + float roi_center_w = current_roi[1] * spatial_scale - offset; + float roi_center_h = current_roi[2] * spatial_scale - offset; + float roi_width = current_roi[3] * spatial_scale; + float roi_height = current_roi[4] * spatial_scale; + // float theta = current_roi[5] * M_PI / 180.0; + float theta = current_roi[5]; // Radian angle by default + if (clockwise) + { + theta = -theta; + } + float cos_theta = cos(theta); + float sin_theta = sin(theta); + if (!aligned) + { // for backward-compatibility only + roi_width = std::max(roi_width, (float)1.); + roi_height = std::max(roi_height, (float)1.); + } - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - int index = index_n_c + ph * pooled_width + pw; + float bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + float bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const float count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector pre_calc(roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. 
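
When sampling_ratio is not positive, ROIAlignRotatedForwardCPU above sizes the sampling grid adaptively, taking ceil(roi_extent / pooled_extent) samples per axis, and later divides each bin's accumulated value by the fixed per-bin count. A small worked example (the RoI dimensions are made up):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main() {
        float roi_height = 20.f, roi_width = 13.f;
        int pooled_height = 7, pooled_width = 7;
        int sampling_ratio = 0;  // <= 0 selects the adaptive grid
        int grid_h = (sampling_ratio > 0) ? sampling_ratio
                                          : (int)std::ceil(roi_height / pooled_height);
        int grid_w = (sampling_ratio > 0) ? sampling_ratio
                                          : (int)std::ceil(roi_width / pooled_width);
        int count = std::max(grid_h * grid_w, 1);  // samples averaged per output bin
        std::printf("%d x %d = %d samples per bin\n", grid_h, grid_w, count);  // 3 x 2 = 6
        return 0;
    }
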
+ float roi_start_h = -roi_height / 2.0; + float roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate(height, width, pooled_height, pooled_width, roi_bin_grid_h, roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, sin_theta, pre_calc); + + for (int c = 0; c < channels; c++) + { + int index_n_c = index_n + c * pooled_width * pooled_height; + const float* offset_input = input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) + { + for (int pw = 0; pw < pooled_width; pw++) + { + int index = index_n_c + ph * pooled_width + pw; + + float output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n + } - float output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - PreCalc pc = pre_calc[pre_calc_index]; - output_val += pc.w1 * offset_input[pc.pos1] + pc.w2 * offset_input[pc.pos2] + - pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext* context) + { + // Setup inputs + const OrtValue* input_X = ort_.KernelContext_GetInput(context, 0); + const float* X_data = reinterpret_cast(ort_.GetTensorData(input_X)); + const OrtValue* input_rois = ort_.KernelContext_GetInput(context, 1); + const float* rois = + reinterpret_cast(ort_.GetTensorData(input_rois)); + + // Setup output + OrtTensorDimensions out_dimensions(ort_, input_X); + OrtTensorDimensions roi_dimensions(ort_, input_rois); + + int batch_size = out_dimensions.data()[0]; + int input_channels = out_dimensions.data()[1]; + int input_height = out_dimensions.data()[2]; + int input_width = out_dimensions.data()[3]; + + out_dimensions.data()[0] = roi_dimensions.data()[0]; + out_dimensions.data()[2] = aligned_height_; + out_dimensions.data()[3] = aligned_width_; + + OrtValue* output = + ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size()); + float* out = ort_.GetTensorMutableData(output); + OrtTensorTypeAndShapeInfo* output_info = ort_.GetTensorTypeAndShape(output); + ort_.ReleaseTensorTypeAndShapeInfo(output_info); + + // TODO: forward here + int output_size = out_dimensions.data()[0]; + for (auto i = 1; i < out_dimensions.size(); ++i) + { + output_size *= out_dimensions.data()[i]; + } + ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_, input_channels, input_height, input_width, aligned_height_, aligned_width_, sampling_ratio_); + } - pre_calc_index += 1; - } - } - output_val /= count; - - output[index] = output_val; - } // for pw - } // for ph - } // for c - } // for n -} - -void MMCVRoIAlignRotatedKernel::Compute(OrtKernelContext *context) { - // Setup inputs - const OrtValue *input_X = ort_.KernelContext_GetInput(context, 0); - const float *X_data = reinterpret_cast(ort_.GetTensorData(input_X)); - const OrtValue *input_rois = ort_.KernelContext_GetInput(context, 1); - const float *rois = - reinterpret_cast(ort_.GetTensorData(input_rois)); - - // Setup output - OrtTensorDimensions 
out_dimensions(ort_, input_X); - OrtTensorDimensions roi_dimensions(ort_, input_rois); - - int batch_size = out_dimensions.data()[0]; - int input_channels = out_dimensions.data()[1]; - int input_height = out_dimensions.data()[2]; - int input_width = out_dimensions.data()[3]; - - out_dimensions.data()[0] = roi_dimensions.data()[0]; - out_dimensions.data()[2] = aligned_height_; - out_dimensions.data()[3] = aligned_width_; - - OrtValue *output = - ort_.KernelContext_GetOutput(context, 0, out_dimensions.data(), out_dimensions.size()); - float *out = ort_.GetTensorMutableData(output); - OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output); - ort_.ReleaseTensorTypeAndShapeInfo(output_info); - - // TODO: forward here - int output_size = out_dimensions.data()[0]; - for (auto i = 1; i < out_dimensions.size(); ++i) { - output_size *= out_dimensions.data()[i]; - } - ROIAlignRotatedForwardCPU(output_size, X_data, rois, out, spatial_scale_, aligned_, clockwise_, - input_channels, input_height, input_width, aligned_height_, - aligned_width_, sampling_ratio_); -} - -REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp); + REGISTER_ONNXRUNTIME_OPS(mmdeploy, MMCVRoIAlignRotatedCustomOp); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h index c0129d31f8..24a90e5321 100644 --- a/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h +++ b/csrc/mmdeploy/backend_ops/onnxruntime/roi_align_rotated/roi_align_rotated.h @@ -10,50 +10,70 @@ #include #include -namespace mmdeploy { -struct MMCVRoIAlignRotatedKernel { - public: - MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) : ort_(ort) { - aligned_height_ = ort_.KernelInfoGetAttribute(info, "output_height"); - aligned_width_ = ort_.KernelInfoGetAttribute(info, "output_width"); - sampling_ratio_ = ort_.KernelInfoGetAttribute(info, "sampling_ratio"); - spatial_scale_ = ort_.KernelInfoGetAttribute(info, "spatial_scale"); - aligned_ = ort_.KernelInfoGetAttribute(info, "aligned"); - clockwise_ = ort_.KernelInfoGetAttribute(info, "clockwise"); - } - - void Compute(OrtKernelContext* context); - - private: - Ort::CustomOpApi ort_; - int aligned_height_; - int aligned_width_; - float spatial_scale_; - int sampling_ratio_; - int aligned_; - int clockwise_; -}; - -struct MMCVRoIAlignRotatedCustomOp - : Ort::CustomOpBase { - void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const { - return new MMCVRoIAlignRotatedKernel(api, info); - } - const char* GetName() const { return "MMCVRoIAlignRotated"; } - - size_t GetInputTypeCount() const { return 2; } - ONNXTensorElementDataType GetInputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - size_t GetOutputTypeCount() const { return 1; } - ONNXTensorElementDataType GetOutputType(size_t) const { - return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } - - // force cpu - const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; } -}; +namespace mmdeploy +{ + struct MMCVRoIAlignRotatedKernel + { + public: + MMCVRoIAlignRotatedKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info) + : ort_(ort) + { + aligned_height_ = ort_.KernelInfoGetAttribute(info, "output_height"); + aligned_width_ = ort_.KernelInfoGetAttribute(info, "output_width"); + sampling_ratio_ = ort_.KernelInfoGetAttribute(info, "sampling_ratio"); + spatial_scale_ = ort_.KernelInfoGetAttribute(info, 
"spatial_scale"); + aligned_ = ort_.KernelInfoGetAttribute(info, "aligned"); + clockwise_ = ort_.KernelInfoGetAttribute(info, "clockwise"); + } + + void Compute(OrtKernelContext* context); + + private: + Ort::CustomOpApi ort_; + int aligned_height_; + int aligned_width_; + float spatial_scale_; + int sampling_ratio_; + int aligned_; + int clockwise_; + }; + + struct MMCVRoIAlignRotatedCustomOp + : Ort::CustomOpBase + { + void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const + { + return new MMCVRoIAlignRotatedKernel(api, info); + } + const char* GetName() const + { + return "MMCVRoIAlignRotated"; + } + + size_t GetInputTypeCount() const + { + return 2; + } + ONNXTensorElementDataType GetInputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + size_t GetOutputTypeCount() const + { + return 1; + } + ONNXTensorElementDataType GetOutputType(size_t) const + { + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } + + // force cpu + const char* GetExecutionProviderType() const + { + return "CPUExecutionProvider"; + } + }; } // namespace mmdeploy #endif // ONNXRUNTIME_ROI_ALIGN_ROTATED_H diff --git a/csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt b/csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt index a221311acd..d43e8c4a1b 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/tensorrt/CMakeLists.txt @@ -4,28 +4,28 @@ project(mmdeploy_tensorrt_ops) include(${CMAKE_SOURCE_DIR}/cmake/tensorrt.cmake) # cub -if (NOT DEFINED CUB_ROOT_DIR) - if (CUDA_VERSION VERSION_LESS 11.0) - set(CUB_ROOT_DIR "${CMAKE_SOURCE_DIR}/third_party/cub") - endif () -endif () +if(NOT DEFINED CUB_ROOT_DIR) + if(CUDA_VERSION VERSION_LESS 11.0) + set(CUB_ROOT_DIR "${CMAKE_SOURCE_DIR}/third_party/cub") + endif() +endif() file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu) add_library(${PROJECT_NAME}_obj OBJECT "${BACKEND_OPS_SRCS}") -set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1) +set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE + 1) target_compile_definitions(${PROJECT_NAME}_obj - PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1) + PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1) target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common) + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common) target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common) + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common) target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include) + PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include) target_include_directories(${PROJECT_NAME}_obj PRIVATE ${TENSORRT_INCLUDE_DIR}) target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUDNN_DIR}/include) target_include_directories(${PROJECT_NAME}_obj PRIVATE ${CUB_ROOT_DIR}) -target_link_libraries(${PROJECT_NAME}_obj - PUBLIC ${TENSORRT_LIBS} cublas cudnn) +target_link_libraries(${PROJECT_NAME}_obj PUBLIC ${TENSORRT_LIBS} cublas cudnn) mmdeploy_export(${PROJECT_NAME}_obj) # Build module library. 
It is used to convert onnx model to tensorrt engine diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp index 431f2dd63b..3bb08a5e22 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.cpp @@ -9,225 +9,314 @@ #include "nms/kernel.h" #include "trt_serialize.hpp" -namespace mmdeploy { -using namespace nvinfer1; -using nvinfer1::plugin::NMSParameters; - -namespace { -static const char* NMS_PLUGIN_VERSION{"1"}; -static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"}; -} // namespace - -TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex) - : TRTPluginBase(name), param(params), mReturnIndex(returnIndex) {} - -TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, ¶m); - deserialize_value(&data, &length, &mClipBoxes); - deserialize_value(&data, &length, &mReturnIndex); -} - -int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT { - int num = mReturnIndex ? 3 : 2; - return num; -} - -nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { - ASSERT(nbInputs == 2); - ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs()); - ASSERT(inputs[0].nbDims == 4); - ASSERT(inputs[1].nbDims == 3); - - nvinfer1::DimsExprs ret; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = exprBuilder.constant(param.keepTopK); - switch (outputIndex) { - case 0: - ret.nbDims = 3; - ret.d[2] = exprBuilder.constant(5); - break; - case 1: - ret.nbDims = 2; - break; - case 2: - ret.nbDims = 2; - default: - break; - } - - return ret; -} - -size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - size_t batch_size = inputs[0].dims.d[0]; - size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3]; - size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2]; - size_t num_priors = inputs[0].dims.d[1]; - bool shareLocation = (inputs[0].dims.d[2] == 1); - int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1]; - return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, - param.numClasses, num_priors, topk, DataType::kFLOAT, - DataType::kFLOAT); -} - -int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - const void* const locData = inputs[0]; - const void* const confData = inputs[1]; - - void* nmsedDets = outputs[0]; - void* nmsedLabels = outputs[1]; - void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr; - - size_t batch_size = inputDesc[0].dims.d[0]; - size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; - size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2]; - size_t num_priors = inputDesc[0].dims.d[1]; - bool shareLocation = (inputDesc[0].dims.d[2] == 1); - - int topk = - param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? 
param.topK : inputDesc[1].dims.d[1]; - bool rotated = false; - pluginStatus_t status = nmsInference( - stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId, - num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold, - DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nmsedIndex, - workSpace, param.isNormalized, false, mClipBoxes, rotated); - ASSERT(status == STATUS_SUCCESS); - - return 0; -} - -size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT { - // NMSParameters - return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex); -} - -void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, param); - serialize_value(&buffer, mClipBoxes); - serialize_value(&buffer, mReturnIndex); -} - -void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments -} - -bool TRTBatchedNMS::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 3 || pos == 4) { - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - } - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; } - -const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT { return NMS_PLUGIN_VERSION; } - -IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT { - auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->setClipParam(mClipBoxes); - return plugin; -} - -nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT { - ASSERT(index >= 0 && index < this->getNbOutputs()); - if (index == 1 || index == 2) { - return nvinfer1::DataType::kINT32; - } - return inputTypes[0]; -} - -void TRTBatchedNMS::setClipParam(bool clip) { mClipBoxes = clip; } - -TRTBatchedNMSCreator::TRTBatchedNMSCreator() { - mPluginAttributes.emplace_back( - PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back( - PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); - mPluginAttributes.emplace_back( - PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); - mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; } - -const char* TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT { - return NMS_PLUGIN_VERSION; -} - 
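
createPlugin below matches attributes by name while walking the PluginFieldCollection, so a caller only supplies the fields it cares about; anything omitted keeps the defaults set at the top of the function (clip_boxes = true, return_index = false). A hedged sketch of driving the creator through the TensorRT plugin registry; the helper name and attribute values are hypothetical, and it assumes this library is loaded so that REGISTER_TENSORRT_PLUGIN has already run:

    #include <vector>
    #include <NvInfer.h>

    nvinfer1::IPluginV2* make_batched_nms() {
        auto* creator = getPluginRegistry()->getPluginCreator("TRTBatchedNMS", "1", "");
        if (creator == nullptr) return nullptr;  // plugin library not loaded / not registered
        int num_classes = 80, topk = 1000, keep_topk = 100;
        float iou_threshold = 0.5f, score_threshold = 0.05f;
        std::vector<nvinfer1::PluginField> fields{
            {"num_classes", &num_classes, nvinfer1::PluginFieldType::kINT32, 1},
            {"topk", &topk, nvinfer1::PluginFieldType::kINT32, 1},
            {"keep_topk", &keep_topk, nvinfer1::PluginFieldType::kINT32, 1},
            {"iou_threshold", &iou_threshold, nvinfer1::PluginFieldType::kFLOAT32, 1},
            {"score_threshold", &score_threshold, nvinfer1::PluginFieldType::kFLOAT32, 1}};
        nvinfer1::PluginFieldCollection fc{static_cast<int>(fields.size()), fields.data()};
        return creator->createPlugin("batched_nms", &fc);
    }
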
-IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name, - const PluginFieldCollection* fc) TRT_NOEXCEPT { - const PluginField* fields = fc->fields; - bool clipBoxes = true; - bool returnIndex = false; - nvinfer1::plugin::NMSParameters params{}; - - for (int i = 0; i < fc->nbFields; ++i) { - const char* attrName = fields[i].name; - if (!strcmp(attrName, "background_label_id")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.backgroundLabelId = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "num_classes")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.numClasses = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "topk")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.topK = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "keep_topk")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.keepTopK = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "score_threshold")) { - ASSERT(fields[i].type == PluginFieldType::kFLOAT32); - params.scoreThreshold = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "iou_threshold")) { - ASSERT(fields[i].type == PluginFieldType::kFLOAT32); - params.iouThreshold = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "is_normalized")) { - params.isNormalized = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "clip_boxes")) { - clipBoxes = *(static_cast(fields[i].data)); - } else if (!strcmp(attrName, "return_index")) { - returnIndex = *(static_cast(fields[i].data)); - } - } - - TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex); - plugin->setClipParam(clipBoxes); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; -} - -IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, const void* serialData, - size_t serialLength) TRT_NOEXCEPT { - // This object will be deleted when the network is destroyed, which will - // call NMS::destroy() - TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator); +namespace mmdeploy +{ + using namespace nvinfer1; + using nvinfer1::plugin::NMSParameters; + + namespace + { + static const char* NMS_PLUGIN_VERSION{"1"}; + static const char* NMS_PLUGIN_NAME{"TRTBatchedNMS"}; + } // namespace + + TRTBatchedNMS::TRTBatchedNMS(const std::string& name, NMSParameters params, bool returnIndex) + : TRTPluginBase(name) + , param(params) + , mReturnIndex(returnIndex) + { + } + + TRTBatchedNMS::TRTBatchedNMS(const std::string& name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, ¶m); + deserialize_value(&data, &length, &mClipBoxes); + deserialize_value(&data, &length, &mReturnIndex); + } + + int TRTBatchedNMS::getNbOutputs() const TRT_NOEXCEPT + { + int num = mReturnIndex ? 
3 : 2; + return num; + } + + nvinfer1::DimsExprs TRTBatchedNMS::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + ASSERT(nbInputs == 2); + ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs()); + ASSERT(inputs[0].nbDims == 4); + ASSERT(inputs[1].nbDims == 3); + + nvinfer1::DimsExprs ret; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = exprBuilder.constant(param.keepTopK); + switch (outputIndex) + { + case 0: + ret.nbDims = 3; + ret.d[2] = exprBuilder.constant(5); + break; + case 1: + ret.nbDims = 2; + break; + case 2: + ret.nbDims = 2; + default: + break; + } + + return ret; + } + + size_t TRTBatchedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + size_t batch_size = inputs[0].dims.d[0]; + size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3]; + size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2]; + size_t num_priors = inputs[0].dims.d[1]; + bool shareLocation = (inputs[0].dims.d[2] == 1); + int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1]; + return detectionInferenceWorkspaceSize(shareLocation, + batch_size, + boxes_size, + score_size, + param.numClasses, + num_priors, + topk, + DataType::kFLOAT, + DataType::kFLOAT); + } + + int TRTBatchedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const void* const locData = inputs[0]; + const void* const confData = inputs[1]; + + void* nmsedDets = outputs[0]; + void* nmsedLabels = outputs[1]; + void* nmsedIndex = mReturnIndex ? outputs[2] : nullptr; + + size_t batch_size = inputDesc[0].dims.d[0]; + size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; + size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2]; + size_t num_priors = inputDesc[0].dims.d[1]; + bool shareLocation = (inputDesc[0].dims.d[2] == 1); + + int topk = + param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? 
param.topK : inputDesc[1].dims.d[1]; + bool rotated = false; + pluginStatus_t status = nmsInference(stream, + batch_size, + boxes_size, + score_size, + shareLocation, + param.backgroundLabelId, + num_priors, + param.numClasses, + topk, + param.keepTopK, + param.scoreThreshold, + param.iouThreshold, + DataType::kFLOAT, + locData, + DataType::kFLOAT, + confData, + nmsedDets, + nmsedLabels, + nmsedIndex, + workSpace, + param.isNormalized, + false, + mClipBoxes, + rotated); + ASSERT(status == STATUS_SUCCESS); + + return 0; + } + + size_t TRTBatchedNMS::getSerializationSize() const TRT_NOEXCEPT + { + // NMSParameters + return sizeof(NMSParameters) + sizeof(mClipBoxes) + sizeof(mReturnIndex); + } + + void TRTBatchedNMS::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, param); + serialize_value(&buffer, mClipBoxes); + serialize_value(&buffer, mReturnIndex); + } + + void TRTBatchedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + } + + bool TRTBatchedNMS::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 3 || pos == 4) + { + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + const char* TRTBatchedNMS::getPluginType() const TRT_NOEXCEPT + { + return NMS_PLUGIN_NAME; + } + + const char* TRTBatchedNMS::getPluginVersion() const TRT_NOEXCEPT + { + return NMS_PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTBatchedNMS::clone() const TRT_NOEXCEPT + { + auto* plugin = new TRTBatchedNMS(mLayerName, param, mReturnIndex); + plugin->setPluginNamespace(mNamespace.c_str()); + plugin->setClipParam(mClipBoxes); + return plugin; + } + + nvinfer1::DataType TRTBatchedNMS::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + ASSERT(index >= 0 && index < this->getNbOutputs()); + if (index == 1 || index == 2) + { + return nvinfer1::DataType::kINT32; + } + return inputTypes[0]; + } + + void TRTBatchedNMS::setClipParam(bool clip) + { + mClipBoxes = clip; + } + + TRTBatchedNMSCreator::TRTBatchedNMSCreator() + { + mPluginAttributes.emplace_back( + PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back( + PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back( + PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("return_index", nullptr, PluginFieldType::kINT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTBatchedNMSCreator::getPluginName() const TRT_NOEXCEPT + { + return NMS_PLUGIN_NAME; + } + + const char* 
TRTBatchedNMSCreator::getPluginVersion() const TRT_NOEXCEPT + { + return NMS_PLUGIN_VERSION; + } + + IPluginV2Ext* TRTBatchedNMSCreator::createPlugin(const char* name, + const PluginFieldCollection* fc) TRT_NOEXCEPT + { + const PluginField* fields = fc->fields; + bool clipBoxes = true; + bool returnIndex = false; + nvinfer1::plugin::NMSParameters params{}; + + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "background_label_id")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.backgroundLabelId = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "num_classes")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.numClasses = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "topk")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.topK = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "keep_topk")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.keepTopK = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "score_threshold")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + params.scoreThreshold = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "iou_threshold")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + params.iouThreshold = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "is_normalized")) + { + params.isNormalized = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "clip_boxes")) + { + clipBoxes = *(static_cast(fields[i].data)); + } + else if (!strcmp(attrName, "return_index")) + { + returnIndex = *(static_cast(fields[i].data)); + } + } + + TRTBatchedNMS* plugin = new TRTBatchedNMS(name, params, returnIndex); + plugin->setClipParam(clipBoxes); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; + } + + IPluginV2Ext* TRTBatchedNMSCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + // This object will be deleted when the network is destroyed, which will + // call NMS::destroy() + TRTBatchedNMS* plugin = new TRTBatchedNMS(name, serialData, serialLength); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTBatchedNMSCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp index d1e5d643db..b1d77a54d0 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_nms/trt_batched_nms.hpp @@ -8,75 +8,94 @@ #include "NvInferPluginUtils.h" #include "trt_plugin_base.hpp" -namespace mmdeploy { +namespace mmdeploy +{ -enum NMSReturnType { RETURN_DETS = 1, RETURN_INDEX = 1 << 1 }; + enum NMSReturnType + { + RETURN_DETS = 1, + RETURN_INDEX = 1 << 1 + }; -class TRTBatchedNMS : public TRTPluginBase { - public: - TRTBatchedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param, bool returnIndex); + class TRTBatchedNMS : public TRTPluginBase + { + public: + TRTBatchedNMS(const std::string& name, + nvinfer1::plugin::NMSParameters param, + bool returnIndex); - TRTBatchedNMS(const std::string& name, const void* data, size_t length); + TRTBatchedNMS(const std::string& name, const void* data, size_t length); - ~TRTBatchedNMS() TRT_NOEXCEPT override = default; + ~TRTBatchedNMS() TRT_NOEXCEPT override = default; - int 
getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, - int nbInputs, nvinfer1::IExprBuilder& exprBuilder) - TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void* buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* outputs, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType, - int nbInputs) const TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputType, + int nbInputs) const TRT_NOEXCEPT override; - void setClipParam(bool clip); + void setClipParam(bool clip); - private: - nvinfer1::plugin::NMSParameters param{}; - bool mClipBoxes{}; - bool mReturnIndex{}; -}; + private: + nvinfer1::plugin::NMSParameters param{}; + bool mClipBoxes{}; + bool mReturnIndex{}; + }; -class TRTBatchedNMSCreator : public TRTPluginCreatorBase { - public: - TRTBatchedNMSCreator(); + class TRTBatchedNMSCreator : public TRTPluginCreatorBase + { + public: + TRTBatchedNMSCreator(); - ~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default; + ~TRTBatchedNMSCreator() TRT_NOEXCEPT override = default; - const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() 
const TRT_NOEXCEPT override; - nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_BATCHED_NMS_PLUGIN_CUSTOM_H diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp index 9d977bc937..80b5be6abc 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.cpp @@ -8,222 +8,295 @@ #include "nms/kernel.h" #include "trt_serialize.hpp" -namespace mmdeploy { -using namespace nvinfer1; -using nvinfer1::plugin::NMSParameters; - -namespace { -static const char* NMS_PLUGIN_VERSION{"1"}; -static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"}; -} // namespace - -TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params) - : TRTPluginBase(name), param(params) {} - -TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &param); - deserialize_value(&data, &length, &mClipBoxes); -} - -int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT { return 2; } - -nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { - ASSERT(nbInputs == 2); - ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs()); - ASSERT(inputs[0].nbDims == 4); - ASSERT(inputs[1].nbDims == 3); - - nvinfer1::DimsExprs ret; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = exprBuilder.constant(param.keepTopK); - switch (outputIndex) { - case 0: - ret.nbDims = 3; - ret.d[2] = exprBuilder.constant(6); - break; - case 1: - ret.nbDims = 2; - break; - default: - break; - } - - return ret; -} - -size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - size_t batch_size = inputs[0].dims.d[0]; - size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3]; - size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2]; - size_t num_priors = inputs[0].dims.d[1]; - bool shareLocation = (inputs[0].dims.d[2] == 1); - int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? 
param.topK : inputs[1].dims.d[1]; - return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, - param.numClasses, num_priors, topk, DataType::kFLOAT, - DataType::kFLOAT); -} - -int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, - const void* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - const void* const locData = inputs[0]; - const void* const confData = inputs[1]; - - void* nmsedDets = outputs[0]; - void* nmsedLabels = outputs[1]; - - size_t batch_size = inputDesc[0].dims.d[0]; - size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; - size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2]; - size_t num_priors = inputDesc[0].dims.d[1]; - bool shareLocation = (inputDesc[0].dims.d[2] == 1); - - int topk = - param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? param.topK : inputDesc[1].dims.d[1]; - bool rotated = true; - pluginStatus_t status = nmsInference( - stream, batch_size, boxes_size, score_size, shareLocation, param.backgroundLabelId, - num_priors, param.numClasses, topk, param.keepTopK, param.scoreThreshold, param.iouThreshold, - DataType::kFLOAT, locData, DataType::kFLOAT, confData, nmsedDets, nmsedLabels, nullptr, - workSpace, param.isNormalized, false, mClipBoxes, rotated); - ASSERT(status == STATUS_SUCCESS); - - return 0; -} - -size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT { - // NMSParameters, - return sizeof(NMSParameters) + sizeof(bool); -} - -void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, param); - serialize_value(&buffer, mClipBoxes); -} - -void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments -} - -bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 3) { - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - } - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT { return NMS_PLUGIN_NAME; } - -const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT { - return NMS_PLUGIN_VERSION; -} - -IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT { - auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param); - plugin->setPluginNamespace(mNamespace.c_str()); - plugin->setClipParam(mClipBoxes); - return plugin; -} - -nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT { - ASSERT(index >= 0 && index < this->getNbOutputs()); - if (index == 1) { - return nvinfer1::DataType::kINT32; - } - return inputTypes[0]; -} - -void TRTBatchedRotatedNMS::setClipParam(bool clip) { mClipBoxes = clip; } - -TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() { - mPluginAttributes.emplace_back( - PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("topk", 
nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back( - PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); - mPluginAttributes.emplace_back( - PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); - mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT { - return NMS_PLUGIN_NAME; -} - -const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT { - return NMS_PLUGIN_VERSION; -} - -IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin( - const char* name, const PluginFieldCollection* fc) TRT_NOEXCEPT { - const PluginField* fields = fc->fields; - bool clipBoxes = true; - nvinfer1::plugin::NMSParameters params{}; - - for (int i = 0; i < fc->nbFields; ++i) { - const char* attrName = fields[i].name; - if (!strcmp(attrName, "background_label_id")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.backgroundLabelId = *(static_cast<const int*>(fields[i].data)); - } else if (!strcmp(attrName, "num_classes")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.numClasses = *(static_cast<const int*>(fields[i].data)); - } else if (!strcmp(attrName, "topk")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.topK = *(static_cast<const int*>(fields[i].data)); - } else if (!strcmp(attrName, "keep_topk")) { - ASSERT(fields[i].type == PluginFieldType::kINT32); - params.keepTopK = *(static_cast<const int*>(fields[i].data)); - } else if (!strcmp(attrName, "score_threshold")) { - ASSERT(fields[i].type == PluginFieldType::kFLOAT32); - params.scoreThreshold = *(static_cast<const float*>(fields[i].data)); - } else if (!strcmp(attrName, "iou_threshold")) { - ASSERT(fields[i].type == PluginFieldType::kFLOAT32); - params.iouThreshold = *(static_cast<const float*>(fields[i].data)); - } else if (!strcmp(attrName, "is_normalized")) { - params.isNormalized = *(static_cast<const bool*>(fields[i].data)); - } else if (!strcmp(attrName, "clip_boxes")) { - clipBoxes = *(static_cast<const bool*>(fields[i].data)); - } - } - - TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params); - plugin->setClipParam(clipBoxes); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; -} - -IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name, - const void* serialData, - size_t serialLength) TRT_NOEXCEPT { - // This object will be deleted when the network is destroyed, which will - // call NMS::destroy() - TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator); +namespace mmdeploy +{ + using namespace nvinfer1; + using nvinfer1::plugin::NMSParameters; + + namespace + { + static const char* NMS_PLUGIN_VERSION{"1"}; + static const char* NMS_PLUGIN_NAME{"TRTBatchedRotatedNMS"}; + } // namespace + + TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, NMSParameters params) + : TRTPluginBase(name) + , param(params) + { + } + + TRTBatchedRotatedNMS::TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &param); + 
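// note: values must be read back in the same order serialize() writes them (param, then mClipBoxes) +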
deserialize_value(&data, &length, &mClipBoxes); + } + + int TRTBatchedRotatedNMS::getNbOutputs() const TRT_NOEXCEPT + { + return 2; + } + + nvinfer1::DimsExprs TRTBatchedRotatedNMS::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + ASSERT(nbInputs == 2); + ASSERT(outputIndex >= 0 && outputIndex < this->getNbOutputs()); + ASSERT(inputs[0].nbDims == 4); + ASSERT(inputs[1].nbDims == 3); + + nvinfer1::DimsExprs ret; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = exprBuilder.constant(param.keepTopK); + switch (outputIndex) + { + case 0: + ret.nbDims = 3; + ret.d[2] = exprBuilder.constant(6); + break; + case 1: + ret.nbDims = 2; + break; + default: + break; + } + + return ret; + } + + size_t TRTBatchedRotatedNMS::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + size_t batch_size = inputs[0].dims.d[0]; + size_t boxes_size = inputs[0].dims.d[1] * inputs[0].dims.d[2] * inputs[0].dims.d[3]; + size_t score_size = inputs[1].dims.d[1] * inputs[1].dims.d[2]; + size_t num_priors = inputs[0].dims.d[1]; + bool shareLocation = (inputs[0].dims.d[2] == 1); + int topk = param.topK > 0 && param.topK <= inputs[1].dims.d[1] ? param.topK : inputs[1].dims.d[1]; + return detectionInferenceWorkspaceSize(shareLocation, batch_size, boxes_size, score_size, param.numClasses, num_priors, topk, DataType::kFLOAT, DataType::kFLOAT); + } + + int TRTBatchedRotatedNMS::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const void* const locData = inputs[0]; + const void* const confData = inputs[1]; + + void* nmsedDets = outputs[0]; + void* nmsedLabels = outputs[1]; + + size_t batch_size = inputDesc[0].dims.d[0]; + size_t boxes_size = inputDesc[0].dims.d[1] * inputDesc[0].dims.d[2] * inputDesc[0].dims.d[3]; + size_t score_size = inputDesc[1].dims.d[1] * inputDesc[1].dims.d[2]; + size_t num_priors = inputDesc[0].dims.d[1]; + bool shareLocation = (inputDesc[0].dims.d[2] == 1); + + int topk = + param.topK > 0 && param.topK <= inputDesc[1].dims.d[1] ? 
param.topK : inputDesc[1].dims.d[1]; + bool rotated = true; + pluginStatus_t status = nmsInference( + stream, + batch_size, + boxes_size, + score_size, + shareLocation, + param.backgroundLabelId, + num_priors, + param.numClasses, + topk, + param.keepTopK, + param.scoreThreshold, + param.iouThreshold, + DataType::kFLOAT, + locData, + DataType::kFLOAT, + confData, + nmsedDets, + nmsedLabels, + nullptr, + workSpace, + param.isNormalized, + false, + mClipBoxes, + rotated); + ASSERT(status == STATUS_SUCCESS); + + return 0; + } + + size_t TRTBatchedRotatedNMS::getSerializationSize() const TRT_NOEXCEPT + { + // NMSParameters, + return sizeof(NMSParameters) + sizeof(bool); + } + + void TRTBatchedRotatedNMS::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, param); + serialize_value(&buffer, mClipBoxes); + } + + void TRTBatchedRotatedNMS::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + } + + bool TRTBatchedRotatedNMS::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 3) + { + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + const char* TRTBatchedRotatedNMS::getPluginType() const TRT_NOEXCEPT + { + return NMS_PLUGIN_NAME; + } + + const char* TRTBatchedRotatedNMS::getPluginVersion() const TRT_NOEXCEPT + { + return NMS_PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTBatchedRotatedNMS::clone() const TRT_NOEXCEPT + { + auto* plugin = new TRTBatchedRotatedNMS(mLayerName, param); + plugin->setPluginNamespace(mNamespace.c_str()); + plugin->setClipParam(mClipBoxes); + return plugin; + } + + nvinfer1::DataType TRTBatchedRotatedNMS::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + ASSERT(index >= 0 && index < this->getNbOutputs()); + if (index == 1) + { + return nvinfer1::DataType::kINT32; + } + return inputTypes[0]; + } + + void TRTBatchedRotatedNMS::setClipParam(bool clip) + { + mClipBoxes = clip; + } + + TRTBatchedRotatedNMSCreator::TRTBatchedRotatedNMSCreator() + { + mPluginAttributes.emplace_back( + PluginField("background_label_id", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("num_classes", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("keep_topk", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back( + PluginField("score_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back( + PluginField("iou_threshold", nullptr, PluginFieldType::kFLOAT32, 1)); + mPluginAttributes.emplace_back(PluginField("is_normalized", nullptr, PluginFieldType::kINT32, 1)); + mPluginAttributes.emplace_back(PluginField("clip_boxes", nullptr, PluginFieldType::kINT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTBatchedRotatedNMSCreator::getPluginName() const TRT_NOEXCEPT + { + return NMS_PLUGIN_NAME; + } + + const char* TRTBatchedRotatedNMSCreator::getPluginVersion() const TRT_NOEXCEPT + { + return NMS_PLUGIN_VERSION; + } + + 
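// createPlugin: translate the PluginFieldCollection attributes (label ids, topK, thresholds, flags) into NMSParameters before constructing the plugin +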
IPluginV2Ext* TRTBatchedRotatedNMSCreator::createPlugin( + const char* name, + const PluginFieldCollection* fc) TRT_NOEXCEPT + { + const PluginField* fields = fc->fields; + bool clipBoxes = true; + nvinfer1::plugin::NMSParameters params{}; + + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "background_label_id")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.backgroundLabelId = *(static_cast<const int*>(fields[i].data)); + } + else if (!strcmp(attrName, "num_classes")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.numClasses = *(static_cast<const int*>(fields[i].data)); + } + else if (!strcmp(attrName, "topk")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.topK = *(static_cast<const int*>(fields[i].data)); + } + else if (!strcmp(attrName, "keep_topk")) + { + ASSERT(fields[i].type == PluginFieldType::kINT32); + params.keepTopK = *(static_cast<const int*>(fields[i].data)); + } + else if (!strcmp(attrName, "score_threshold")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + params.scoreThreshold = *(static_cast<const float*>(fields[i].data)); + } + else if (!strcmp(attrName, "iou_threshold")) + { + ASSERT(fields[i].type == PluginFieldType::kFLOAT32); + params.iouThreshold = *(static_cast<const float*>(fields[i].data)); + } + else if (!strcmp(attrName, "is_normalized")) + { + params.isNormalized = *(static_cast<const bool*>(fields[i].data)); + } + else if (!strcmp(attrName, "clip_boxes")) + { + clipBoxes = *(static_cast<const bool*>(fields[i].data)); + } + } + + TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, params); + plugin->setClipParam(clipBoxes); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; + } + + IPluginV2Ext* TRTBatchedRotatedNMSCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + // This object will be deleted when the network is destroyed, which will + // call NMS::destroy() + TRTBatchedRotatedNMS* plugin = new TRTBatchedRotatedNMS(name, serialData, serialLength); + plugin->setPluginNamespace(mNamespace.c_str()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTBatchedRotatedNMSCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp index 66479eb7e7..be156dc9c9 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/batched_rotated_nms/trt_batched_rotated_nms.hpp @@ -7,72 +7,85 @@ #include "NvInferPluginUtils.h" #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTBatchedRotatedNMS : public TRTPluginBase { - public: - TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param); +namespace mmdeploy +{ + class TRTBatchedRotatedNMS : public TRTPluginBase + { + public: + TRTBatchedRotatedNMS(const std::string& name, nvinfer1::plugin::NMSParameters param); - TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length); + TRTBatchedRotatedNMS(const std::string& name, const void* data, size_t length); - ~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default; + ~TRTBatchedRotatedNMS() TRT_NOEXCEPT override = default; - int getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, - int nbInputs, nvinfer1::IExprBuilder& exprBuilder) - TRT_NOEXCEPT 
override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workSpace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void* buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* outputs, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputType, - int nbInputs) const TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputType, + int nbInputs) const TRT_NOEXCEPT override; - void setClipParam(bool clip); + void setClipParam(bool clip); - private: - nvinfer1::plugin::NMSParameters param{}; - bool mClipBoxes{}; -}; + private: + nvinfer1::plugin::NMSParameters param{}; + bool mClipBoxes{}; + }; -class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase { - public: - TRTBatchedRotatedNMSCreator(); + class TRTBatchedRotatedNMSCreator : public TRTPluginCreatorBase + { + public: + TRTBatchedRotatedNMSCreator(); - ~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default; + ~TRTBatchedRotatedNMSCreator() TRT_NOEXCEPT override = default; - const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2Ext* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) 
TRT_NOEXCEPT override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, const void* serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp index 0f236e4956..6f46a9f295 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.cpp @@ -10,176 +10,228 @@ #include "trt_serialize.hpp" using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"TRTBicubicInterpolate"}; -} // namespace - -TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string &name, - std::vector<float> scale_factor, bool align_corners) - : TRTPluginBase(name), mScaleFactor(scale_factor), mAlignCorners(align_corners) {} - -TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mScaleFactor); - deserialize_value(&data, &length, &mAlignCorners); - } - -nvinfer1::IPluginV2DynamicExt *TRTBicubicInterpolate::clone() const TRT_NOEXCEPT { - TRTBicubicInterpolate *plugin = - new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - auto height = exprBuilder.constant(mScaleFactor[0]); - auto width = exprBuilder.constant(mScaleFactor[1]); - auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height); - auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width); - ret.d[2] = d2; - ret.d[3] = d3; - - return ret; -} - -bool TRTBicubicInterpolate::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int batch = inputDesc[0].dims.d[0]; - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - - int height_out = outputDesc[0].dims.d[2]; - int width_out = 
outputDesc[0].dims.d[3]; - const void *x = inputs[0]; - void *output = outputs[0]; - - // TODO: add fp16 support - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - bicubic_interpolate<float>((float *)x, (float *)output, batch, channels, height, width, - height_out, width_out, mAlignCorners, stream); - break; - default: - return 1; - break; - } - - return 0; -} - -nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index, - const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mScaleFactor) + serialized_size(mAlignCorners); -} - -void TRTBicubicInterpolate::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mScaleFactor); - serialize_value(&buffer, mAlignCorners); -} - -////////////////////// creator ///////////////////////////// - -TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - nvinfer1::Dims size{2, {1, 1}}; - std::vector<float> scale_factor; - bool align_corners = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("scale_factor") == 0) { - int data_size = (fc->fields[i].length); - if (data_size != 2) { - data_size = data_size / sizeof(float); - } - ASSERT(data_size == 2) - const float *data_start = static_cast<const float *>(fc->fields[i].data); - scale_factor = std::vector<float>(data_start, data_start + data_size); - } - - if (field_name.compare("align_corners") == 0) { - align_corners = static_cast<const bool *>(fc->fields[i].data)[0]; - } - } - - TRTBicubicInterpolate *plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTBicubicInterpolateCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"TRTBicubicInterpolate"}; + } // namespace + + TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string& name, + std::vector<float> scale_factor, + bool align_corners) + : TRTPluginBase(name) + , mScaleFactor(scale_factor) + , mAlignCorners(align_corners) + { + } + + 
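// deserializing constructor: restores mScaleFactor and mAlignCorners in the order written by serialize() +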
TRTBicubicInterpolate::TRTBicubicInterpolate(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mScaleFactor); + deserialize_value(&data, &length, &mAlignCorners); + } + + nvinfer1::IPluginV2DynamicExt* TRTBicubicInterpolate::clone() const TRT_NOEXCEPT + { + TRTBicubicInterpolate* plugin = + new TRTBicubicInterpolate(mLayerName, mScaleFactor, mAlignCorners); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTBicubicInterpolate::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + auto height = exprBuilder.constant(mScaleFactor[0]); + auto width = exprBuilder.constant(mScaleFactor[1]); + auto d2 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *height); + auto d3 = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[3], *width); + ret.d[2] = d2; + ret.d[3] = d3; + + return ret; + } + + bool TRTBicubicInterpolate::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void TRTBicubicInterpolate::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t TRTBicubicInterpolate::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTBicubicInterpolate::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int batch = inputDesc[0].dims.d[0]; + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + + int height_out = outputDesc[0].dims.d[2]; + int width_out = outputDesc[0].dims.d[3]; + const void* x = inputs[0]; + void* output = outputs[0]; + + // TODO: add fp16 support + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + bicubic_interpolate<float>((float*)x, + (float*)output, + batch, + channels, + height, + width, + height_out, + width_out, + mAlignCorners, + stream); + break; + default: + return 1; + break; + } + + return 0; + } + + nvinfer1::DataType TRTBicubicInterpolate::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTBicubicInterpolate::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTBicubicInterpolate::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTBicubicInterpolate::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTBicubicInterpolate::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mScaleFactor) + serialized_size(mAlignCorners); + } + + void TRTBicubicInterpolate::serialize(void* 
buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mScaleFactor); + serialize_value(&buffer, mAlignCorners); + } + + ////////////////////// creator ///////////////////////////// + + TRTBicubicInterpolateCreator::TRTBicubicInterpolateCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("scale_factor")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("align_corners")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTBicubicInterpolateCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTBicubicInterpolateCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + nvinfer1::Dims size{2, {1, 1}}; + std::vector<float> scale_factor; + bool align_corners = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("scale_factor") == 0) + { + int data_size = (fc->fields[i].length); + if (data_size != 2) + { + data_size = data_size / sizeof(float); + } + ASSERT(data_size == 2) + const float* data_start = static_cast<const float*>(fc->fields[i].data); + scale_factor = std::vector<float>(data_start, data_start + data_size); + } + + if (field_name.compare("align_corners") == 0) + { + align_corners = static_cast<const bool*>(fc->fields[i].data)[0]; + } + } + + TRTBicubicInterpolate* plugin = new TRTBicubicInterpolate(name, scale_factor, align_corners); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTBicubicInterpolateCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTBicubicInterpolate(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(TRTBicubicInterpolateCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp index 37ad7cf9ff..9a66c5e718 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate.hpp @@ -7,61 +7,78 @@ #include #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTBicubicInterpolate : public TRTPluginBase { - public: - TRTBicubicInterpolate(const std::string &name, std::vector<float> scale_factor, - bool align_corners); - - TRTBicubicInterpolate(const std::string name, const void *data, size_t length); - - TRTBicubicInterpolate() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, 
- const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - std::vector<float> mScaleFactor; - bool mAlignCorners; -}; - -class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase { - public: - TRTBicubicInterpolateCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class TRTBicubicInterpolate : public TRTPluginBase + { + public: + TRTBicubicInterpolate(const std::string& name, std::vector<float> scale_factor, bool align_corners); + + TRTBicubicInterpolate(const std::string name, const void* data, size_t length); + + TRTBicubicInterpolate() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + std::vector<float> mScaleFactor; + bool mAlignCorners; + }; + + class TRTBicubicInterpolateCreator : public TRTPluginCreatorBase + { + public: + TRTBicubicInterpolateCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + 
TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_BICUBIC_INTERPOLATE_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu index efb078c431..2c189e0a45 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.cu @@ -12,159 +12,236 @@ // Based on // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm -template <typename scalar_t> -__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) { - return ((A + 2) * x - (A + 3)) * x * x + 1; +template<typename scalar_t> +__device__ __forceinline__ static scalar_t cubic_convolution1(scalar_t x, scalar_t A) +{ + return ((A + 2) * x - (A + 3)) * x * x + 1; } -template <typename scalar_t> -__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) { - return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +template<typename scalar_t> +__device__ __forceinline__ static scalar_t cubic_convolution2(scalar_t x, scalar_t A) +{ + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; } -template <typename scalar_t> +template<typename scalar_t> __device__ __forceinline__ static void get_cubic_upsample_coefficients(scalar_t coeffs[4], - scalar_t t) { - scalar_t A = -0.75; - - scalar_t x1 = t; - coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A); - coeffs[1] = cubic_convolution1<scalar_t>(x1, A); - - // opposite coefficients - scalar_t x2 = 1.0 - t; - coeffs[2] = cubic_convolution1<scalar_t>(x2, A); - coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A); + scalar_t t) +{ + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A); + coeffs[1] = cubic_convolution1<scalar_t>(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1<scalar_t>(x2, A); + coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A); } -template <typename scalar_t> -__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, - scalar_t x3, scalar_t t) { - scalar_t coeffs[4]; - get_cubic_upsample_coefficients<scalar_t>(coeffs, t); - - return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +template<typename scalar_t> +__device__ __forceinline__ static scalar_t cubic_interp1d(scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + scalar_t t) +{ + scalar_t coeffs[4]; + get_cubic_upsample_coefficients<scalar_t>(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; } /* Used by UpSampleBicubic2d.cu */ -template <typename scalar_t> -__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t *data, - int batch, int channel, - int batchsize, int channels, - int height, int width, int y, - int x) { - int access_y = max(min(y, height - 1), 0); - int access_x = max(min(x, width - 1), 0); - return data[batch * channels * height * width + channel * height * width + access_y * width + - access_x]; +template<typename scalar_t> +__device__ __forceinline__ static scalar_t upsample_get_value_bounded(const scalar_t* data, + int batch, + int channel, + int batchsize, + int channels, + int height, + int width, + int y, + int x) +{ + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch * channels * height * width + channel * height * width + + access_y * width + + access_x]; } -template <typename scalar_t> -__device__ __forceinline__ scalar_t 
-area_pixel_compute_source_index(scalar_t scale, int64_t dst_index, bool align_corners, bool cubic) { - if (align_corners) { - return scale * dst_index; - } else { - scalar_t src_idx = scale * (dst_index + 0.5) - 0.5; - // [Note] Follow Opencv resize logic: - // We allow negative src_idx here and later will use - // dx = src_idx - floorf(src_idx) - // to compute the "distance"(which affects weights). - // For linear modes, weight distribution doesn't matter - // for negative indices as they use 2 pixels to interpolate. - // For example, [-1, 0], they both use pixel 0 value so it - // doesn't affect if we bound the src_idx to 0 or not. - // TODO: Our current linear mode impls use unbound indices - // where we should and then remove this cubic flag. - // This matters in cubic mode, as we might need [-1, 0, 1, 2] - // to interpolate and the weights can be affected. - return (!cubic && src_idx < 0) ? scalar_t(0) : src_idx; - } +template<typename scalar_t> +__device__ __forceinline__ scalar_t area_pixel_compute_source_index(scalar_t scale, + int64_t dst_index, + bool align_corners, + bool cubic) +{ + if (align_corners) + { + return scale * dst_index; + } + else + { + scalar_t src_idx = scale * (dst_index + 0.5) - 0.5; + // [Note] Follow Opencv resize logic: + // We allow negative src_idx here and later will use + // dx = src_idx - floorf(src_idx) + // to compute the "distance"(which affects weights). + // For linear modes, weight distribution doesn't matter + // for negative indices as they use 2 pixels to interpolate. + // For example, [-1, 0], they both use pixel 0 value so it + // doesn't affect if we bound the src_idx to 0 or not. + // TODO: Our current linear mode impls use unbound indices + // where we should and then remove this cubic flag. + // This matters in cubic mode, as we might need [-1, 0, 1, 2] + // to interpolate and the weights can be affected. + return (!cubic && src_idx < 0) ? 
scalar_t(0) : src_idx; + } } // cubic interpolation pytorch -template <typename scalar_t> -__global__ void resize_cubic_kernel_torch(const int num_elements, const scalar_t *src, - const int batchsize, const int channels, int srcWidth, - int srcHeight, scalar_t *dst, int dstWidth, int dstHeight, - bool align_corners, float height_scale, - float width_scale) { - CUDA_1D_KERNEL_LOOP(index, num_elements) { - // Special case: input and output are the same size, just copy - const int output_x = index % dstWidth; - const int output_y = index / dstWidth; - - if (srcHeight == dstHeight && srcWidth == dstWidth) { - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; c++) { - const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + - output_y * dstWidth + output_x]; - dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + - output_x] = val; +template<typename scalar_t> +__global__ void resize_cubic_kernel_torch(const int num_elements, + const scalar_t* src, + const int batchsize, + const int channels, + int srcWidth, + int srcHeight, + scalar_t* dst, + int dstWidth, + int dstHeight, + bool align_corners, + float height_scale, + float width_scale) +{ + CUDA_1D_KERNEL_LOOP(index, num_elements) + { + // Special case: input and output are the same size, just copy + const int output_x = index % dstWidth; + const int output_y = index / dstWidth; + + if (srcHeight == dstHeight && srcWidth == dstWidth) + { + for (int n = 0; n < batchsize; n++) + { + for (int c = 0; c < channels; c++) + { + const scalar_t val = src[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + + output_y * dstWidth + + output_x]; + dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + + output_y * dstWidth + + output_x] = val; + } + } + return; } - } - return; - } - // Interpolation kernel - scalar_t real_x = - area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); - int in_x = floorf(real_x); - scalar_t t_x = real_x - in_x; - - scalar_t real_y = - area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); - int in_y = floorf(real_y); - scalar_t t_y = real_y - in_y; - - for (int n = 0; n < batchsize; n++) { - for (int c = 0; c < channels; c++) { - scalar_t coefficients[4]; - - for (int k = 0; k < 4; k++) { - coefficients[k] = cubic_interp1d( - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x - 1), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 0), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 1), - upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, - in_y - 1 + k, in_x + 2), - t_x); + // Interpolation kernel + scalar_t real_x = + area_pixel_compute_source_index(width_scale, output_x, align_corners, /*cubic=*/true); + int in_x = floorf(real_x); + scalar_t t_x = real_x - in_x; + + scalar_t real_y = + area_pixel_compute_source_index(height_scale, output_y, align_corners, /*cubic=*/true); + int in_y = floorf(real_y); + scalar_t t_y = real_y - in_y; + + for (int n = 0; n < batchsize; n++) + { + for (int c = 0; c < channels; c++) + { + scalar_t coefficients[4]; + + for (int k = 0; k < 4; k++) + { + coefficients[k] = cubic_interp1d( + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x - 1), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, 
srcWidth, in_y - 1 + k, in_x + 0), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x + 1), + upsample_get_value_bounded(src, n, c, batchsize, channels, srcHeight, srcWidth, in_y - 1 + k, in_x + 2), + t_x); + } + + dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + + output_y * dstWidth + + output_x] = scalar_t(cubic_interp1d(coefficients[0], + coefficients[1], + coefficients[2], + coefficients[3], + t_y)); + } } - - dst[n * channels * dstHeight * dstWidth + c * dstHeight * dstWidth + output_y * dstWidth + - output_x] = scalar_t(cubic_interp1d(coefficients[0], coefficients[1], coefficients[2], - coefficients[3], t_y)); - } } - } } -template <typename scalar_t> -void resizeGPU(const scalar_t *pIn_d, scalar_t *pOut_d, int batch, int channels, int srcWidth, - int srcHeight, int dstWidth, int dstHeight, bool align_corners, - cudaStream_t stream) { - float height_scale = float(srcHeight) / dstHeight; - float width_scale = float(srcWidth) / dstWidth; - if (align_corners && dstWidth > 1 && dstHeight > 1) { - height_scale = (float)(srcHeight - 1) / (dstHeight - 1); - width_scale = (float)(srcWidth - 1) / (dstWidth - 1); - } - int n = batch * dstWidth * dstHeight * channels; - resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>( - dstWidth * dstHeight, pIn_d, batch, channels, srcWidth, srcHeight, pOut_d, dstWidth, - dstHeight, align_corners, height_scale, width_scale); +template<typename scalar_t> +void resizeGPU(const scalar_t* pIn_d, + scalar_t* pOut_d, + int batch, + int channels, + int srcWidth, + int srcHeight, + int dstWidth, + int dstHeight, + bool align_corners, + cudaStream_t stream) +{ + float height_scale = float(srcHeight) / dstHeight; + float width_scale = float(srcWidth) / dstWidth; + if (align_corners && dstWidth > 1 && dstHeight > 1) + { + height_scale = (float)(srcHeight - 1) / (dstHeight - 1); + width_scale = (float)(srcWidth - 1) / (dstWidth - 1); + } + int n = batch * dstWidth * dstHeight * channels; + resize_cubic_kernel_torch<<<GET_BLOCKS(n), THREADS_PER_BLOCK, 0, stream>>>(dstWidth * dstHeight, + pIn_d, + batch, + channels, + srcWidth, + srcHeight, + pOut_d, + dstWidth, + dstHeight, + align_corners, + height_scale, + width_scale); } -template <typename scalar_t> -void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels, - int in_height, int in_width, int out_height, int out_width, - bool align_corners, cudaStream_t stream) { - resizeGPU(input, output, batch, channels, in_width, in_height, out_width, out_height, - align_corners, stream); +template<typename scalar_t> +void bicubic_interpolate(const scalar_t* input, + scalar_t* output, + int batch, + int channels, + int in_height, + int in_width, + int out_height, + int out_width, + bool align_corners, + cudaStream_t stream) +{ + resizeGPU(input, + output, + batch, + channels, + in_width, + in_height, + out_width, + out_height, + align_corners, + stream); } -template void bicubic_interpolate<float>(const float *input, float *output, int batch, int channels, - int in_height, int in_width, int out_height, int out_width, - bool align_corners, cudaStream_t stream); +template void bicubic_interpolate<float>(const float* input, + float* output, + int batch, + int channels, + int in_height, + int in_width, + int out_height, + int out_width, + bool align_corners, + cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp index 66560f59f5..4ecf16c5fe 100644 --- 
a/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/bicubic_interpolate/trt_bicubic_interpolate_kernel.hpp @@ -4,8 +4,15 @@ #include "common_cuda_helper.hpp" -template <typename scalar_t> -void bicubic_interpolate(const scalar_t *input, scalar_t *output, int batch, int channels, - int in_height, int in_width, int out_height, int out_width, - bool align_corners, cudaStream_t stream); +template<typename scalar_t> +void bicubic_interpolate(const scalar_t* input, + scalar_t* output, + int batch, + int channels, + int in_height, + int in_width, + int out_height, + int out_width, + bool align_corners, + cudaStream_t stream); #endif // TRT_BICUBIC_INTERPOLATE_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp index c76cac8a32..c71de75638 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/common_cuda_helper.hpp @@ -9,25 +9,27 @@ #include #define CUDA_1D_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) #define THREADS_PER_BLOCK 512 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) -inline int GET_BLOCKS(const int N) { - int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); - int max_block_num = 4096; - return std::min(optimal_block_num, max_block_num); +inline int GET_BLOCKS(const int N) +{ + int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK); + int max_block_num = 4096; + return std::min(optimal_block_num, max_block_num); } -#define cudaCheckError() \ - { \ - cudaError_t e = cudaGetLastError(); \ - if (e != cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(0); \ - } \ - } +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) \ + { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(0); \ + } \ + } /** * Returns a view of the original tensor with its dimensions permuted. 
@@ -39,44 +41,61 @@ inline int GET_BLOCKS(const int N) { * @param[in] src_dim dim of src tensor * @param[in] stream cuda stream handle */ -template <typename scalar_t> -void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim, - cudaStream_t stream = 0); +template<typename scalar_t> +void memcpyPermute(scalar_t* dst, + const scalar_t* src, + int* src_size, + int* permute, + int src_dim, + cudaStream_t stream = 0); -template <typename scalar_t> -cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha, - const scalar_t* A, int lda, const scalar_t* B, int ldb, - const scalar_t* beta, scalar_t* C, int ldc); +template<typename scalar_t> +cublasStatus_t cublasGemmWrap(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const scalar_t* alpha, + const scalar_t* A, + int lda, + const scalar_t* B, + int ldb, + const scalar_t* beta, + scalar_t* C, + int ldc); -template <typename scalar_t> +template<typename scalar_t> __device__ __forceinline__ scalar_t bilinear_interpolate(const scalar_t* __restrict__ input, - const int height, const int width, - scalar_t y, scalar_t x) { - // deal with cases that inverse elements are out of feature map boundary - if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + const int height, + const int width, + scalar_t y, + scalar_t x) +{ + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; - y = min(scalar_t(height - 1), max(scalar_t(0), y)); - x = min(scalar_t(width - 1), max(scalar_t(0), x)); + y = min(scalar_t(height - 1), max(scalar_t(0), y)); + x = min(scalar_t(width - 1), max(scalar_t(0), x)); - const int y_low = floor(y); - const int x_low = floor(x); - const int y_high = ceil(y); - const int x_high = ceil(x); + const int y_low = floor(y); + const int x_low = floor(x); + const int y_high = ceil(y); + const int x_high = ceil(x); - const scalar_t v1 = input[y_low * width + x_low]; - const scalar_t v2 = input[y_low * width + x_high]; - const scalar_t v3 = input[y_high * width + x_low]; - const scalar_t v4 = input[y_high * width + x_high]; + const scalar_t v1 = input[y_low * width + x_low]; + const scalar_t v2 = input[y_low * width + x_high]; + const scalar_t v3 = input[y_high * width + x_low]; + const scalar_t v4 = input[y_high * width + x_high]; - // lerp can be performed by fma - const scalar_t ly = y - y_low; - const scalar_t lx = x - x_low; - const scalar_t v_low = fma(v2 - v1, lx, v1); - const scalar_t v_high = fma(v4 - v3, lx, v3); - const scalar_t val = fma(v_high - v_low, ly, v_low); + // lerp can be performed by fma + const scalar_t ly = y - y_low; + const scalar_t lx = x - x_low; + const scalar_t v_low = fma(v2 - v1, lx, v1); + const scalar_t v_high = fma(v4 - v3, lx, v3); + const scalar_t val = fma(v_high - v_low, ly, v_low); - return val; + return val; } #endif // COMMON_CUDA_HELPER diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp index 22cffa0605..542db78b96 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/batched_nms_kernel.hpp @@ -6,14 +6,29 @@ #include "cuda_runtime_api.h" #include "kernel.h" -pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize, - const int perBatchScoresSize, const bool shareLocation, - const int backgroundLabelId, const int numPredsPerClass, - 
const int numClasses, const int topK, const int keepTopK, - const float scoreThreshold, const float iouThreshold, - const DataType DT_BBOX, const void* locData, const DataType DT_SCORE, - const void* confData, void* nmsedDets, void* nmsedLabels, - void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid, - bool clipBoxes, bool rotated = false); +pluginStatus_t nmsInference(cudaStream_t stream, + const int N, + const int perBatchBoxesSize, + const int perBatchScoresSize, + const bool shareLocation, + const int backgroundLabelId, + const int numPredsPerClass, + const int numClasses, + const int topK, + const int keepTopK, + const float scoreThreshold, + const float iouThreshold, + const DataType DT_BBOX, + const void* locData, + const DataType DT_SCORE, + const void* confData, + void* nmsedDets, + void* nmsedLabels, + void* nmsedIndex, + void* workspace, + bool isNormalized, + bool confSigmoid, + bool clipBoxes, + bool rotated = false); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h index 93fd2a4fb9..19efec4ac5 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/cub_helper.h @@ -2,14 +2,19 @@ // modify from // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin #include "kernel.h" -template -size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) { - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)NULL, temp_storage_bytes, - (const KeyT*)NULL, (KeyT*)NULL, - (const ValueT*)NULL, (ValueT*)NULL, - num_items, // # items - num_segments, // # segments - (const int*)NULL, (const int*)NULL); - return temp_storage_bytes; +template +size_t cubSortPairsWorkspaceSize(int num_items, int num_segments) +{ + size_t temp_storage_bytes = 0; + cub::DeviceSegmentedRadixSort::SortPairsDescending((void*)NULL, + temp_storage_bytes, + (const KeyT*)NULL, + (KeyT*)NULL, + (const ValueT*)NULL, + (ValueT*)NULL, + num_items, // # items + num_segments, // # segments + (const int*)NULL, + (const int*)NULL); + return temp_storage_bytes; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h index 1b50fa4e9f..6e690731d9 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/nms/kernel.h @@ -15,72 +15,152 @@ using namespace nvinfer1; #define DEBUG_ENABLE 0 -template -struct Bbox { - T xmin, ymin, xmax, ymax; - Bbox(T xmin, T ymin, T xmax, T ymax) : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax) {} - Bbox() = default; +template +struct Bbox +{ + T xmin, ymin, xmax, ymax; + Bbox(T xmin, T ymin, T xmax, T ymax) + : xmin(xmin) + , ymin(ymin) + , xmax(xmax) + , ymax(ymax) + { + } + Bbox() = default; }; -size_t get_cuda_arch(int devID); - -int8_t* alignPtr(int8_t* ptr, uintptr_t to); - -int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize); - -void setUniformOffsets(cudaStream_t stream, int num_segments, int offset, int* d_offsets); - -pluginStatus_t allClassNMS(cudaStream_t stream, int num, int num_classes, int num_preds_per_class, - int top_k, float nms_threshold, bool share_location, bool isNormalized, - DataType DT_SCORE, DataType DT_BBOX, void* bbox_data, - void* beforeNMS_scores, void* beforeNMS_index_array, - void* afterNMS_scores, void* afterNMS_index_array, bool flipXY = false); - -pluginStatus_t allClassRotatedNMS(cudaStream_t 
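// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// cubSortPairsWorkspaceSize above relies on CUB's standard two-phase
// convention: when d_temp_storage is NULL, SortPairsDescending only writes
// the required byte count into temp_storage_bytes and returns without
// sorting. A hedged usage sketch (float keys / int values and the buffer
// names are our assumptions):
#include <cub/cub.cuh>

size_t query_sort_workspace(int num_items, int num_segments, const int* d_offsets)
{
    size_t temp_storage_bytes = 0;
    // Phase 1: null workspace -> pure size query, nothing is sorted.
    cub::DeviceSegmentedRadixSort::SortPairsDescending(
        nullptr, temp_storage_bytes,
        (const float*)nullptr, (float*)nullptr,  // keys in / out
        (const int*)nullptr, (int*)nullptr,      // values in / out
        num_items, num_segments,
        d_offsets, d_offsets + 1);               // segment begin / end offsets
    return temp_storage_bytes;  // phase 2 repeats the call with a real buffer
}
// ---------------------------------------------------------------------------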
stream, int num, int num_classes, - int num_preds_per_class, int top_k, float nms_threshold, - bool share_location, bool isNormalized, DataType DT_SCORE, - DataType DT_BBOX, void* bbox_data, void* beforeNMS_scores, - void* beforeNMS_index_array, void* afterNMS_scores, - void* afterNMS_index_array, bool flipXY = false); - -size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX); - -size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX); - -size_t sortScoresPerClassWorkspaceSize(int num, int num_classes, int num_preds_per_class, - DataType DT_CONF); - -size_t sortScoresPerImageWorkspaceSize(int num_images, int num_items_per_image, DataType DT_SCORE); - -pluginStatus_t sortScoresPerImage(cudaStream_t stream, int num_images, int num_items_per_image, - DataType DT_SCORE, void* unsorted_scores, - void* unsorted_bbox_indices, void* sorted_scores, - void* sorted_bbox_indices, void* workspace); - -pluginStatus_t sortScoresPerClass(cudaStream_t stream, int num, int num_classes, - int num_preds_per_class, int background_label_id, - float confidence_threshold, DataType DT_SCORE, - void* conf_scores_gpu, void* index_array_gpu, void* workspace); - -size_t calculateTotalWorkspaceSize(size_t* workspaces, int count); - -pluginStatus_t permuteData(cudaStream_t stream, int nthreads, int num_classes, int num_data, - int num_dim, DataType DT_DATA, bool confSigmoid, const void* data, - void* new_data); - -size_t detectionForwardPreNMSSize(int N, int C2); - -size_t detectionForwardPostNMSSize(int N, int numClasses, int topK); - -pluginStatus_t gatherNMSOutputs(cudaStream_t stream, bool shareLocation, int numImages, - int numPredsPerClass, int numClasses, int topK, int keepTopK, - DataType DT_BBOX, DataType DT_SCORE, const void* indices, - const void* scores, const void* bboxData, void* nmsedDets, - void* nmsedLabels, void* nmsedIndex = nullptr, - bool clipBoxes = true, bool rotated = false); - -size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, - int numPredsPerClass, int topK, DataType DT_BBOX, - DataType DT_SCORE); +size_t get_cuda_arch(int devID); + +int8_t* alignPtr(int8_t* ptr, + uintptr_t to); + +int8_t* nextWorkspacePtr(int8_t* ptr, + uintptr_t previousWorkspaceSize); + +void setUniformOffsets(cudaStream_t stream, + int num_segments, + int offset, + int* d_offsets); + +pluginStatus_t allClassNMS(cudaStream_t stream, + int num, + int num_classes, + int num_preds_per_class, + int top_k, + float nms_threshold, + bool share_location, + bool isNormalized, + DataType DT_SCORE, + DataType DT_BBOX, + void* bbox_data, + void* beforeNMS_scores, + void* beforeNMS_index_array, + void* afterNMS_scores, + void* afterNMS_index_array, + bool flipXY = false); + +pluginStatus_t allClassRotatedNMS(cudaStream_t stream, + int num, + int num_classes, + int num_preds_per_class, + int top_k, + float nms_threshold, + bool share_location, + bool isNormalized, + DataType DT_SCORE, + DataType DT_BBOX, + void* bbox_data, + void* beforeNMS_scores, + void* beforeNMS_index_array, + void* afterNMS_scores, + void* afterNMS_index_array, + bool flipXY = false); + +size_t detectionForwardBBoxDataSize(int N, + int C1, + DataType DT_BBOX); + +size_t detectionForwardBBoxPermuteSize(bool shareLocation, + int N, + int C1, + DataType DT_BBOX); + +size_t sortScoresPerClassWorkspaceSize(int num, + int num_classes, + int num_preds_per_class, + DataType DT_CONF); + +size_t sortScoresPerImageWorkspaceSize(int num_images, + int num_items_per_image, + 
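// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// alignPtr / nextWorkspacePtr declared above are the usual helpers for
// carving one device allocation into aligned sub-buffers. A plausible
// implementation sketch (the real definitions live in the .cu sources;
// 16-byte alignment is our assumption):
#include <cstdint>

inline int8_t* align_ptr_sketch(int8_t* ptr, uintptr_t to)
{
    uintptr_t addr = (uintptr_t)ptr;
    addr = (addr + to - 1) & ~(to - 1);  // round up to a multiple of `to` (power of two)
    return (int8_t*)addr;
}

inline int8_t* next_workspace_ptr_sketch(int8_t* ptr, uintptr_t previousWorkspaceSize)
{
    // Step past the previous sub-buffer, then restore alignment.
    return align_ptr_sketch(ptr + previousWorkspaceSize, 16);
}
// ---------------------------------------------------------------------------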
DataType DT_SCORE); + +pluginStatus_t sortScoresPerImage(cudaStream_t stream, + int num_images, + int num_items_per_image, + DataType DT_SCORE, + void* unsorted_scores, + void* unsorted_bbox_indices, + void* sorted_scores, + void* sorted_bbox_indices, + void* workspace); + +pluginStatus_t sortScoresPerClass(cudaStream_t stream, + int num, + int num_classes, + int num_preds_per_class, + int background_label_id, + float confidence_threshold, + DataType DT_SCORE, + void* conf_scores_gpu, + void* index_array_gpu, + void* workspace); + +size_t calculateTotalWorkspaceSize(size_t* workspaces, + int count); + +pluginStatus_t permuteData(cudaStream_t stream, + int nthreads, + int num_classes, + int num_data, + int num_dim, + DataType DT_DATA, + bool confSigmoid, + const void* data, + void* new_data); + +size_t detectionForwardPreNMSSize(int N, + int C2); + +size_t detectionForwardPostNMSSize(int N, + int numClasses, + int topK); + +pluginStatus_t gatherNMSOutputs(cudaStream_t stream, + bool shareLocation, + int numImages, + int numPredsPerClass, + int numClasses, + int topK, + int keepTopK, + DataType DT_BBOX, + DataType DT_SCORE, + const void* indices, + const void* scores, + const void* bboxData, + void* nmsedDets, + void* nmsedLabels, + void* nmsedIndex = nullptr, + bool clipBoxes = true, + bool rotated = false); + +size_t detectionInferenceWorkspaceSize(bool shareLocation, + int N, + int C1, + int C2, + int numClasses, + int numPredsPerClass, + int topK, + DataType DT_BBOX, + DataType DT_SCORE); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp index 8440bb6219..cbe5c1a34c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_base.hpp @@ -5,73 +5,106 @@ #include "NvInferVersion.h" #include "trt_plugin_helper.hpp" -namespace mmdeploy { +namespace mmdeploy +{ #if NV_TENSORRT_MAJOR > 7 -#define TRT_NOEXCEPT noexcept + #define TRT_NOEXCEPT noexcept #else -#define TRT_NOEXCEPT + #define TRT_NOEXCEPT #endif -class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt { - public: - TRTPluginBase(const std::string &name) : mLayerName(name) {} - // IPluginV2 Methods - const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - int initialize() TRT_NOEXCEPT override { return STATUS_SUCCESS; } - void terminate() TRT_NOEXCEPT override {} - void destroy() TRT_NOEXCEPT override { delete this; } - void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override { - mNamespace = pluginNamespace; - } - const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + class TRTPluginBase : public nvinfer1::IPluginV2DynamicExt + { + public: + TRTPluginBase(const std::string& name) + : mLayerName(name) + { + } + // IPluginV2 Methods + const char* getPluginVersion() const TRT_NOEXCEPT override + { + return "1"; + } + int initialize() TRT_NOEXCEPT override + { + return STATUS_SUCCESS; + } + void terminate() TRT_NOEXCEPT override {} + void destroy() TRT_NOEXCEPT override + { + delete this; + } + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override + { + mNamespace = pluginNamespace; + } + const char* getPluginNamespace() const TRT_NOEXCEPT override + { + return mNamespace.c_str(); + } - virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override {} + 
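// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// calculateTotalWorkspaceSize above pairs with the pointer helpers: the
// per-stage workspace sizes are summed with padding so every sub-buffer can
// later be handed out aligned. Minimal sketch of that accumulation (the
// 16-byte boundary is our assumption):
#include <cstddef>

inline size_t total_workspace_size_sketch(const size_t* workspaces, int count)
{
    const size_t kAlign = 16;
    size_t total = 0;
    for (int i = 0; i < count; ++i)
        total += (workspaces[i] + kAlign - 1) / kAlign * kAlign;  // round up each stage
    return total;
}
// ---------------------------------------------------------------------------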
virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} - virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override { - return 0; - } + virtual size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override + { + return 0; + } - virtual void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override {} + virtual void attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override {} - virtual void detachFromContext() TRT_NOEXCEPT override {} + virtual void detachFromContext() TRT_NOEXCEPT override {} - protected: - const std::string mLayerName; - std::string mNamespace; + protected: + const std::string mLayerName; + std::string mNamespace; #if NV_TENSORRT_MAJOR < 8 - protected: - // To prevent compiler warnings. - using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::enqueue; - using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; - using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; - using nvinfer1::IPluginV2DynamicExt::supportsFormat; + protected: + // To prevent compiler warnings. + using nvinfer1::IPluginV2DynamicExt::canBroadcastInputAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::enqueue; + using nvinfer1::IPluginV2DynamicExt::getOutputDimensions; + using nvinfer1::IPluginV2DynamicExt::isOutputBroadcastAcrossBatch; + using nvinfer1::IPluginV2DynamicExt::supportsFormat; #endif -}; + }; -class TRTPluginCreatorBase : public nvinfer1::IPluginCreator { - public: - const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; }; + class TRTPluginCreatorBase : public nvinfer1::IPluginCreator + { + public: + const char* getPluginVersion() const TRT_NOEXCEPT override + { + return "1"; + }; - const nvinfer1::PluginFieldCollection *getFieldNames() TRT_NOEXCEPT override { return &mFC; } + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override + { + return &mFC; + } - void setPluginNamespace(const char *pluginNamespace) TRT_NOEXCEPT override { - mNamespace = pluginNamespace; - } + void setPluginNamespace(const char* pluginNamespace) TRT_NOEXCEPT override + { + mNamespace = pluginNamespace; + } - const char *getPluginNamespace() const TRT_NOEXCEPT override { return mNamespace.c_str(); } + const char* getPluginNamespace() const TRT_NOEXCEPT override + { + return mNamespace.c_str(); + } - protected: - nvinfer1::PluginFieldCollection mFC; - std::vector mPluginAttributes; - std::string mNamespace; -}; + protected: + nvinfer1::PluginFieldCollection mFC; + std::vector mPluginAttributes; + std::string mNamespace; + }; } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp index 41b47acdbe..050c0dd308 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_plugin_helper.hpp @@ -11,145 +11,159 @@ cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype); // Enumerator for 
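// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// convert_trt2cudnn_dtype above is only declared in this header; a plausible
// mapping for the common cases looks like the sketch below. The exact
// coverage and error handling are our assumptions, not copied from the repo
// (the enum values themselves are the real TensorRT / cuDNN ones):
#include <cudnn.h>
#include <NvInfer.h>

inline cudnnStatus_t convert_trt2cudnn_dtype_sketch(nvinfer1::DataType trt_dtype,
                                                    cudnnDataType_t*   cudnn_dtype)
{
    switch (trt_dtype)
    {
        case nvinfer1::DataType::kFLOAT: *cudnn_dtype = CUDNN_DATA_FLOAT; break;
        case nvinfer1::DataType::kHALF:  *cudnn_dtype = CUDNN_DATA_HALF;  break;
        default: return CUDNN_STATUS_BAD_PARAM;  // unsupported dtype
    }
    return CUDNN_STATUS_SUCCESS;
}
// ---------------------------------------------------------------------------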
status -typedef enum { - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 +typedef enum +{ + STATUS_SUCCESS = 0, + STATUS_FAILURE = 1, + STATUS_BAD_PARAM = 2, + STATUS_NOT_SUPPORTED = 3, + STATUS_NOT_INITIALIZED = 4 } pluginStatus_t; -#define ASSERT(assertion) \ - { \ - if (!(assertion)) { \ - std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \ - abort(); \ - } \ - } - -#define CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) { \ - std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \ - << std::endl; \ - } \ - } -#define CUBLASASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != CUBLAS_STATUS_SUCCESS) { \ - std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ - } \ - } -#define CUERRORMSG(status_) \ - { \ - auto s_ = status_; \ - if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ - } +#define ASSERT(assertion) \ + { \ + if (!(assertion)) \ + { \ + std::cerr << "#assertion" << __FILE__ << "," << __LINE__ << std::endl; \ + abort(); \ + } \ + } + +#define CUASSERT(status_) \ + { \ + auto s_ = status_; \ + if (s_ != cudaSuccess) \ + { \ + std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << ", " << cudaGetErrorString(s_) \ + << std::endl; \ + } \ + } +#define CUBLASASSERT(status_) \ + { \ + auto s_ = status_; \ + if (s_ != CUBLAS_STATUS_SUCCESS) \ + { \ + std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ + } \ + } +#define CUERRORMSG(status_) \ + { \ + auto s_ = status_; \ + if (s_ != 0) std::cerr << __FILE__ << ", " << __LINE__ << ", " << s_ << std::endl; \ + } #ifndef DEBUG -#define CHECK(status) \ - do { \ - if (status != 0) abort(); \ - } while (0) - -#define ASSERT_PARAM(exp) \ - do { \ - if (!(exp)) return STATUS_BAD_PARAM; \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do { \ - if (!(exp)) return STATUS_FAILURE; \ - } while (0) - -#define CSC(call, err) \ - do { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) { \ - return err; \ - } \ - } while (0) - -#define DEBUG_PRINTF(...) \ - do { \ - } while (0) + #define CHECK(status) \ + do { \ + if (status != 0) abort(); \ + } while (0) + + #define ASSERT_PARAM(exp) \ + do { \ + if (!(exp)) return STATUS_BAD_PARAM; \ + } while (0) + + #define ASSERT_FAILURE(exp) \ + do { \ + if (!(exp)) return STATUS_FAILURE; \ + } while (0) + + #define CSC(call, err) \ + do { \ + cudaError_t cudaStatus = call; \ + if (cudaStatus != cudaSuccess) \ + { \ + return err; \ + } \ + } while (0) + + #define DEBUG_PRINTF(...) \ + do { \ + } while (0) #else -#define ASSERT_PARAM(exp) \ - do { \ - if (!(exp)) { \ - fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_BAD_PARAM; \ - } \ - } while (0) - -#define ASSERT_FAILURE(exp) \ - do { \ - if (!(exp)) { \ - fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \ - return STATUS_FAILURE; \ - } \ - } while (0) - -#define CSC(call, err) \ - do { \ - cudaError_t cudaStatus = call; \ - if (cudaStatus != cudaSuccess) { \ - printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \ - return err; \ - } \ - } while (0) - -#define CHECK(status) \ - { \ - if (status != 0) { \ - DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ - abort(); \ - } \ - } - -#define DEBUG_PRINTF(...) 
\ - do { \ - printf(__VA_ARGS__); \ - } while (0) + #define ASSERT_PARAM(exp) \ + do { \ + if (!(exp)) \ + { \ + fprintf(stderr, "Bad param - " #exp ", %s:%d\n", __FILE__, __LINE__); \ + return STATUS_BAD_PARAM; \ + } \ + } while (0) + + #define ASSERT_FAILURE(exp) \ + do { \ + if (!(exp)) \ + { \ + fprintf(stderr, "Failure - " #exp ", %s:%d\n", __FILE__, __LINE__); \ + return STATUS_FAILURE; \ + } \ + } while (0) + + #define CSC(call, err) \ + do { \ + cudaError_t cudaStatus = call; \ + if (cudaStatus != cudaSuccess) \ + { \ + printf("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(cudaStatus)); \ + return err; \ + } \ + } while (0) + + #define CHECK(status) \ + { \ + if (status != 0) \ + { \ + DEBUG_PRINTF("%s %d CUDA FAIL %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + abort(); \ + } \ + } + + #define DEBUG_PRINTF(...) \ + do { \ + printf(__VA_ARGS__); \ + } while (0) #endif -namespace mmdeploy { - -const int MAXTENSORDIMS = 10; - -struct TensorDesc { - int shape[MAXTENSORDIMS]; - int stride[MAXTENSORDIMS]; - int dim; -}; - -inline unsigned int getElementSize(nvinfer1::DataType t) { - switch (t) { - case nvinfer1::DataType::kINT32: - return 4; - case nvinfer1::DataType::kFLOAT: - return 4; - case nvinfer1::DataType::kHALF: - return 2; - // case nvinfer1::DataType::kBOOL: - case nvinfer1::DataType::kINT8: - return 1; - default: - throw std::runtime_error("Invalid DataType."); - } - throw std::runtime_error("Invalid DataType."); - return 0; -} - -inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) { - return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number; -} +namespace mmdeploy +{ + + const int MAXTENSORDIMS = 10; + + struct TensorDesc + { + int shape[MAXTENSORDIMS]; + int stride[MAXTENSORDIMS]; + int dim; + }; + + inline unsigned int getElementSize(nvinfer1::DataType t) + { + switch (t) + { + case nvinfer1::DataType::kINT32: + return 4; + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + // case nvinfer1::DataType::kBOOL: + case nvinfer1::DataType::kINT8: + return 1; + default: + throw std::runtime_error("Invalid DataType."); + } + throw std::runtime_error("Invalid DataType."); + return 0; + } + + inline size_t getAlignedSize(size_t origin_size, size_t aligned_number = 16) + { + return size_t((origin_size + aligned_number - 1) / aligned_number) * aligned_number; + } } // namespace mmdeploy #endif // TRT_PLUGIN_HELPER_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp index db88184432..c059a7cfb8 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/common/trt_serialize.hpp @@ -9,89 +9,117 @@ #include #include -template +template inline void serialize_value(void** buffer, T const& value); -template +template inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); -namespace { - -template -struct Serializer {}; - -template -struct Serializer::value || std::is_enum::value || - std::is_pod::value>::type> { - static size_t serialized_size(T const& value) { return sizeof(T); } - static void serialize(void** buffer, T const& value) { - ::memcpy(*buffer, &value, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - } - static void deserialize(void const** buffer, size_t* buffer_size, T* value) { - assert(*buffer_size >= sizeof(T)); - ::memcpy(value, *buffer, sizeof(T)); - reinterpret_cast(*buffer) += 
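// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// The status macros above compose into the usual plugin-launcher shape:
// validate arguments, run CUDA calls through CSC so errors early-return,
// and let DEBUG_PRINTF compile to a no-op in release builds. Hypothetical
// sketch (assumes this header plus <cuda_runtime_api.h> are in scope):
pluginStatus_t example_launcher_sketch(cudaStream_t stream, int n, void* d_buf)
{
    ASSERT_PARAM(n > 0);             // bad argument  -> STATUS_BAD_PARAM
    ASSERT_PARAM(d_buf != nullptr);
    CSC(cudaMemsetAsync(d_buf, 0, (size_t)n, stream), STATUS_FAILURE);  // CUDA error -> STATUS_FAILURE
    DEBUG_PRINTF("launched with n=%d\n", n);  // no-op unless DEBUG is defined
    return STATUS_SUCCESS;
}

// getAlignedSize above is plain round-up arithmetic; worked values with the
// default 16-byte alignment: 1 -> 16, 16 -> 16, 17 -> 32.
// ---------------------------------------------------------------------------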
sizeof(T); - *buffer_size -= sizeof(T); - } -}; - -template <> -struct Serializer { - static size_t serialized_size(const char* value) { return strlen(value) + 1; } - static void serialize(void** buffer, const char* value) { - ::strcpy(static_cast(*buffer), value); - reinterpret_cast(*buffer) += strlen(value) + 1; - } - static void deserialize(void const** buffer, size_t* buffer_size, const char** value) { - *value = static_cast(*buffer); - size_t data_size = strnlen(*value, *buffer_size) + 1; - assert(*buffer_size >= data_size); - reinterpret_cast(*buffer) += data_size; - *buffer_size -= data_size; - } -}; - -template -struct Serializer, - typename std::enable_if::value || std::is_enum::value || - std::is_pod::value>::type> { - static size_t serialized_size(std::vector const& value) { - return sizeof(value.size()) + value.size() * sizeof(T); - } - static void serialize(void** buffer, std::vector const& value) { - serialize_value(buffer, value.size()); - size_t nbyte = value.size() * sizeof(T); - ::memcpy(*buffer, value.data(), nbyte); - reinterpret_cast(*buffer) += nbyte; - } - static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) { - size_t size; - deserialize_value(buffer, buffer_size, &size); - value->resize(size); - size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); - ::memcpy(value->data(), *buffer, nbyte); - reinterpret_cast(*buffer) += nbyte; - *buffer_size -= nbyte; - } -}; +namespace +{ + + template + struct Serializer + { + }; + + template + struct Serializer::value || std::is_enum::value || + std::is_pod::value>::type> + { + static size_t serialized_size(T const& value) + { + return sizeof(T); + } + + static void serialize(void** buffer, T const& value) + { + ::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + + static void deserialize(void const** buffer, size_t* buffer_size, T* value) + { + assert(*buffer_size >= sizeof(T)); + ::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } + }; + + template<> + struct Serializer + { + static size_t serialized_size(const char* value) + { + return strlen(value) + 1; + } + + static void serialize(void** buffer, const char* value) + { + ::strcpy(static_cast(*buffer), value); + reinterpret_cast(*buffer) += strlen(value) + 1; + } + + static void deserialize(void const** buffer, size_t* buffer_size, const char** value) + { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } + }; + + template + struct Serializer, + typename std::enable_if::value || std::is_enum::value || + std::is_pod::value>::type> + { + static size_t serialized_size(std::vector const& value) + { + return sizeof(value.size()) + value.size() * sizeof(T); + } + + static void serialize(void** buffer, std::vector const& value) + { + serialize_value(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + ::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + + static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) + { + size_t size; + deserialize_value(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + assert(*buffer_size >= nbyte); + ::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } + }; } // namespace -template 
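// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// Usage sketch for the serialize/deserialize helpers in this header: a
// plugin streams its POD members and vectors through one contiguous buffer,
// keeping the same statement order on the write and read sides. The struct
// and field names below are hypothetical:
#include <cstddef>
#include <vector>

struct DemoParams
{
    float            iou_threshold;
    std::vector<int> strides;
};

size_t demo_serialized_size(const DemoParams& p)
{
    return serialized_size(p.iou_threshold) + serialized_size(p.strides);
}

void demo_serialize(void* buffer, const DemoParams& p)
{
    serialize_value(&buffer, p.iou_threshold);  // each call advances `buffer`
    serialize_value(&buffer, p.strides);
}

void demo_deserialize(const void* buffer, size_t size, DemoParams* p)
{
    deserialize_value(&buffer, &size, &p->iou_threshold);  // `size` shrinks as we read
    deserialize_value(&buffer, &size, &p->strides);
}
// ---------------------------------------------------------------------------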
-inline size_t serialized_size(T const& value) { - return Serializer::serialized_size(value); +template +inline size_t serialized_size(T const& value) +{ + return Serializer::serialized_size(value); } -template -inline void serialize_value(void** buffer, T const& value) { - return Serializer::serialize(buffer, value); +template +inline void serialize_value(void** buffer, T const& value) +{ + return Serializer::serialize(buffer, value); } -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) { - return Serializer::deserialize(buffer, buffer_size, value); +template +inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) +{ + return Serializer::deserialize(buffer, buffer_size, value); } #endif // TRT_SERIALIZE_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu index 44c08152db..99aba5704c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassNMS.cu @@ -7,261 +7,381 @@ const static int BS = 512; -template -__device__ T_BBOX bboxSize(const Bbox &bbox, const bool normalized, T_BBOX offset) { - if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) { - // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. - return 0; - } else { - T_BBOX width = bbox.xmax - bbox.xmin; - T_BBOX height = bbox.ymax - bbox.ymin; - if (normalized) { - return width * height; - } else { - // If bbox is not within range [0, 1]. - return (width + offset) * (height + offset); +template +__device__ T_BBOX bboxSize(const Bbox& bbox, + const bool normalized, + T_BBOX offset) +{ + if (bbox.xmax < bbox.xmin || bbox.ymax < bbox.ymin) + { + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return 0; + } + else + { + T_BBOX width = bbox.xmax - bbox.xmin; + T_BBOX height = bbox.ymax - bbox.ymin; + if (normalized) + { + return width * height; + } + else + { + // If bbox is not within range [0, 1]. + return (width + offset) * (height + offset); + } } - } } -template -__device__ void intersectBbox(const Bbox &bbox1, const Bbox &bbox2, - Bbox *intersect_bbox) { - if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax || - bbox2.ymax < bbox1.ymin) { - // Return [0, 0, 0, 0] if there is no intersection. - intersect_bbox->xmin = T_BBOX(0); - intersect_bbox->ymin = T_BBOX(0); - intersect_bbox->xmax = T_BBOX(0); - intersect_bbox->ymax = T_BBOX(0); - } else { - intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin); - intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin); - intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax); - intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax); - } +template +__device__ void intersectBbox(const Bbox& bbox1, + const Bbox& bbox2, + Bbox* intersect_bbox) +{ + if (bbox2.xmin > bbox1.xmax || bbox2.xmax < bbox1.xmin || bbox2.ymin > bbox1.ymax || + bbox2.ymax < bbox1.ymin) + { + // Return [0, 0, 0, 0] if there is no intersection. 
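// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// bboxSize / intersectBbox / jaccardOverlap above implement standard
// axis-aligned IoU. Scalar condensation for the normalized case (so no
// +offset term); the type and names are ours:
#include <algorithm>

struct BoxDemo { float xmin, ymin, xmax, ymax; };

inline float iou_demo(const BoxDemo& a, const BoxDemo& b)
{
    const float iw = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
    const float ih = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
    if (iw <= 0.f || ih <= 0.f) return 0.f;  // disjoint boxes
    const float inter  = iw * ih;
    const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
    const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
    return inter / (area_a + area_b - inter);
}
// ---------------------------------------------------------------------------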
+ intersect_bbox->xmin = T_BBOX(0); + intersect_bbox->ymin = T_BBOX(0); + intersect_bbox->xmax = T_BBOX(0); + intersect_bbox->ymax = T_BBOX(0); + } + else + { + intersect_bbox->xmin = max(bbox1.xmin, bbox2.xmin); + intersect_bbox->ymin = max(bbox1.ymin, bbox2.ymin); + intersect_bbox->xmax = min(bbox1.xmax, bbox2.xmax); + intersect_bbox->ymax = min(bbox1.ymax, bbox2.ymax); + } } -template -__device__ float jaccardOverlap(const Bbox &bbox1, const Bbox &bbox2, - const bool normalized, T_BBOX offset) { - Bbox intersect_bbox; - intersectBbox(bbox1, bbox2, &intersect_bbox); - float intersect_width, intersect_height; - if (normalized) { - intersect_width = intersect_bbox.xmax - intersect_bbox.xmin; - intersect_height = intersect_bbox.ymax - intersect_bbox.ymin; - } else { - intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset; - intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset; - } - if (intersect_width > 0 && intersect_height > 0) { - float intersect_size = intersect_width * intersect_height; - float bbox1_size = bboxSize(bbox1, normalized, offset); - float bbox2_size = bboxSize(bbox2, normalized, offset); - return intersect_size / (bbox1_size + bbox2_size - intersect_size); - } else { - return 0.; - } +template +__device__ float jaccardOverlap(const Bbox& bbox1, + const Bbox& bbox2, + const bool normalized, + T_BBOX offset) +{ + Bbox intersect_bbox; + intersectBbox(bbox1, bbox2, &intersect_bbox); + float intersect_width, intersect_height; + if (normalized) + { + intersect_width = intersect_bbox.xmax - intersect_bbox.xmin; + intersect_height = intersect_bbox.ymax - intersect_bbox.ymin; + } + else + { + intersect_width = intersect_bbox.xmax - intersect_bbox.xmin + offset; + intersect_height = intersect_bbox.ymax - intersect_bbox.ymin + offset; + } + if (intersect_width > 0 && intersect_height > 0) + { + float intersect_size = intersect_width * intersect_height; + float bbox1_size = bboxSize(bbox1, normalized, offset); + float bbox2_size = bboxSize(bbox2, normalized, offset); + return intersect_size / (bbox1_size + bbox2_size - intersect_size); + } + else + { + return 0.; + } } /********** new NMS for only score and index array **********/ -// clang-format off -template +template __global__ void #ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ == 620 || __CUDA_ARCH__ == 530 -__launch_bounds__(512) + #if __CUDA_ARCH__ == 620 || __CUDA_ARCH__ == 530 + __launch_bounds__(512) + #endif #endif -#endif -allClassNMS_kernel(const int num, const int num_classes, const int num_preds_per_class, - const int top_k, const float nms_threshold, const bool share_location, - const bool isNormalized, - T_BBOX *bbox_data, // bbox_data should be float to preserve - // location information - T_SCORE *beforeNMS_scores, int *beforeNMS_index_array, - T_SCORE *afterNMS_scores, int *afterNMS_index_array, bool flipXY = false) { - // clang-format on - //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; - __shared__ bool kept_bboxinfo_flag[TSIZE * BS]; - for (int i = 0; i < num; i++) { - const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; - const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation - const int bbox_idx_offset = - share_location ? 
(i * num_preds_per_class) : (i * num_classes * num_preds_per_class); - - // local thread data - int loc_bboxIndex[TSIZE]; - Bbox loc_bbox[TSIZE]; - - // initialize Bbox, Bboxinfo, kept_bboxinfo_flag - // Eliminate shared memory RAW hazard - __syncthreads(); + allClassNMS_kernel(const int num, + const int num_classes, + const int num_preds_per_class, + const int top_k, + const float nms_threshold, + const bool share_location, + const bool isNormalized, + T_BBOX* bbox_data, // bbox_data should be float to preserve location information + T_SCORE* beforeNMS_scores, + int* beforeNMS_index_array, + T_SCORE* afterNMS_scores, + int* afterNMS_index_array, + bool flipXY = false) +{ + //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; + __shared__ bool kept_bboxinfo_flag[TSIZE * BS]; + for (int i = 0; i < num; i++) + { + const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; + const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation + const int bbox_idx_offset = + share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class); + + // local thread data + int loc_bboxIndex[TSIZE]; + Bbox loc_bbox[TSIZE]; + + // initialize Bbox, Bboxinfo, kept_bboxinfo_flag + // Eliminate shared memory RAW hazard + __syncthreads(); #pragma unroll - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if (item_idx < max_idx) + { + loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + + if (loc_bboxIndex[t] >= 0) + // if (loc_bboxIndex[t] != -1) + { + const int bbox_data_idx = share_location ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) : loc_bboxIndex[t]; + + loc_bbox[t].xmin = + flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0]; + loc_bbox[t].ymin = + flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1]; + loc_bbox[t].xmax = + flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2]; + loc_bbox[t].ymax = + flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3]; + kept_bboxinfo_flag[cur_idx] = true; + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } - if (item_idx < max_idx) { - loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + // filter out overlapped boxes with lower scores + int ref_item_idx = offset; + int ref_bbox_idx = share_location ? + (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : + beforeNMS_index_array[ref_item_idx]; - if (loc_bboxIndex[t] >= 0) - // if (loc_bboxIndex[t] != -1) + while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - const int bbox_data_idx = share_location - ? (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) - : loc_bboxIndex[t]; - - loc_bbox[t].xmin = - flipXY ? bbox_data[bbox_data_idx * 4 + 1] : bbox_data[bbox_data_idx * 4 + 0]; - loc_bbox[t].ymin = - flipXY ? bbox_data[bbox_data_idx * 4 + 0] : bbox_data[bbox_data_idx * 4 + 1]; - loc_bbox[t].xmax = - flipXY ? bbox_data[bbox_data_idx * 4 + 3] : bbox_data[bbox_data_idx * 4 + 2]; - loc_bbox[t].ymax = - flipXY ? bbox_data[bbox_data_idx * 4 + 2] : bbox_data[bbox_data_idx * 4 + 3]; - kept_bboxinfo_flag[cur_idx] = true; - } else { - kept_bboxinfo_flag[cur_idx] = false; + Bbox ref_bbox; + ref_bbox.xmin = flipXY ? 
bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0]; + ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1]; + ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2]; + ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3]; + + // Eliminate shared memory RAW hazard + __syncthreads(); + + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) + { + // TODO: may need to add bool normalized as argument, HERE true means + // normalized + if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + } + __syncthreads(); + + do { + ref_item_idx++; + } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); + + ref_bbox_idx = + share_location ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : beforeNMS_index_array[ref_item_idx]; } - } else { - kept_bboxinfo_flag[cur_idx] = false; - } - } - // filter out overlapped boxes with lower scores - int ref_item_idx = offset; - int ref_bbox_idx = - share_location - ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; - - while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - Bbox ref_bbox; - ref_bbox.xmin = flipXY ? bbox_data[ref_bbox_idx * 4 + 1] : bbox_data[ref_bbox_idx * 4 + 0]; - ref_bbox.ymin = flipXY ? bbox_data[ref_bbox_idx * 4 + 0] : bbox_data[ref_bbox_idx * 4 + 1]; - ref_bbox.xmax = flipXY ? bbox_data[ref_bbox_idx * 4 + 3] : bbox_data[ref_bbox_idx * 4 + 2]; - ref_bbox.ymax = flipXY ? bbox_data[ref_bbox_idx * 4 + 2] : bbox_data[ref_bbox_idx * 4 + 3]; - - // Eliminate shared memory RAW hazard - __syncthreads(); - - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; - - if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) { - // TODO: may need to add bool normalized as argument, HERE true means - // normalized - if (jaccardOverlap(ref_bbox, loc_bbox[t], isNormalized, T_BBOX(0)) > nms_threshold) { - kept_bboxinfo_flag[cur_idx] = false; - } + // store data + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int read_item_idx = offset + cur_idx; + const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; + /* + * If not not keeping the bbox + * Set the score to 0 + * Set the bounding box index to -1 + */ + if (read_item_idx < max_idx) + { + afterNMS_scores[write_item_idx] = + kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; + afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1; + } } - } - __syncthreads(); - - do { - ref_item_idx++; - } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]); - - ref_bbox_idx = - share_location - ? 
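// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// Scalar reference of the greedy suppression that allClassNMS_kernel
// parallelizes: boxes arrive sorted by descending score, and each surviving
// box suppresses every later box whose IoU exceeds the threshold. Reuses
// the hypothetical BoxDemo / iou_demo from the sketch above:
#include <cstddef>
#include <vector>

std::vector<int> nms_reference(const std::vector<BoxDemo>& boxes,  // score-sorted
                               float                        iou_threshold)
{
    std::vector<bool> kept(boxes.size(), true);
    std::vector<int>  out;
    for (size_t i = 0; i < boxes.size(); ++i)
    {
        if (!kept[i]) continue;  // already suppressed by an earlier box
        out.push_back((int)i);
        for (size_t j = i + 1; j < boxes.size(); ++j)
            if (kept[j] && iou_demo(boxes[i], boxes[j]) > iou_threshold)
                kept[j] = false;
    }
    return out;  // indices of kept boxes; the kernel writes score 0 / index -1 instead
}
// ---------------------------------------------------------------------------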
(beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) - : beforeNMS_index_array[ref_item_idx]; } - - // store data - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int read_item_idx = offset + cur_idx; - const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx; - /* - * If not not keeping the bbox - * Set the score to 0 - * Set the bounding box index to -1 - */ - if (read_item_idx < max_idx) { - afterNMS_scores[write_item_idx] = - kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f; - afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1; - } - } - } } -template -pluginStatus_t allClassNMS_gpu(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, void *bbox_data, void *beforeNMS_scores, - void *beforeNMS_index_array, void *afterNMS_scores, - void *afterNMS_index_array, bool flipXY = false) { +template +pluginStatus_t allClassNMS_gpu(cudaStream_t stream, + const int num, + const int num_classes, + const int num_preds_per_class, + const int top_k, + const float nms_threshold, + const bool share_location, + const bool isNormalized, + void* bbox_data, + void* beforeNMS_scores, + void* beforeNMS_index_array, + void* afterNMS_scores, + void* afterNMS_index_array, + bool flipXY = false) +{ #define P(tsize) allClassNMS_kernel - void (*kernel[10])(const int, const int, const int, const int, const float, const bool, - const bool, float *, T_SCORE *, int *, T_SCORE *, int *, bool) = { - P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10), - }; - - const int GS = num_classes; - const int t_size = (top_k + BS - 1) / BS; - - ASSERT(t_size <= 10); - kernel[t_size - 1]<<>>( - num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized, - (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array, - (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array, flipXY); - - cudaError_t code = cudaGetLastError(); - CUASSERT(code); - CSC(code, STATUS_FAILURE); - return STATUS_SUCCESS; + void (*kernel[10])(const int, + const int, + const int, + const int, + const float, + const bool, + const bool, + float*, + T_SCORE*, + int*, + T_SCORE*, + int*, + bool) = { + P(1), + P(2), + P(3), + P(4), + P(5), + P(6), + P(7), + P(8), + P(9), + P(10), + }; + + const int GS = num_classes; + const int t_size = (top_k + BS - 1) / BS; + + ASSERT(t_size <= 10); + kernel[t_size - 1]<<>>( + num, + num_classes, + num_preds_per_class, + top_k, + nms_threshold, + share_location, + isNormalized, + (T_BBOX*)bbox_data, + (T_SCORE*)beforeNMS_scores, + (int*)beforeNMS_index_array, + (T_SCORE*)afterNMS_scores, + (int*)afterNMS_index_array, + flipXY); + + cudaError_t code = cudaGetLastError(); + CUASSERT(code); + CSC(code, STATUS_FAILURE); + return STATUS_SUCCESS; } // allClassNMS LAUNCH CONFIG -typedef pluginStatus_t (*nmsFunc)(cudaStream_t, const int, const int, const int, const int, - const float, const bool, const bool, void *, void *, void *, - void *, void *, bool); - -struct nmsLaunchConfigSSD { - DataType t_score; - DataType t_bbox; - nmsFunc function; - - nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {} - nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function) - : t_score(t_score), t_bbox(t_bbox), function(function) {} - bool 
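// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// The P(tsize) table above holds one kernel instantiation per TSIZE, the
// number of candidate boxes each thread owns. Host-side restatement of the
// selection arithmetic (BS = 512, as defined at the top of this file):
#include <cassert>

inline int select_tsize_sketch(int top_k, int block_size = 512)
{
    const int t_size = (top_k + block_size - 1) / block_size;  // ceil(top_k / BS)
    assert(t_size >= 1 && t_size <= 10);  // the table only holds P(1)..P(10)
    return t_size;  // kernel[t_size - 1] is then launched with num_classes blocks
}
// ---------------------------------------------------------------------------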
operator==(const nmsLaunchConfigSSD &other) { - return t_score == other.t_score && t_bbox == other.t_bbox; - } +typedef pluginStatus_t (*nmsFunc)(cudaStream_t, + const int, + const int, + const int, + const int, + const float, + const bool, + const bool, + void*, + void*, + void*, + void*, + void*, + bool); + +struct nmsLaunchConfigSSD +{ + DataType t_score; + DataType t_bbox; + nmsFunc function; + + nmsLaunchConfigSSD(DataType t_score, DataType t_bbox) + : t_score(t_score) + , t_bbox(t_bbox) + { + } + nmsLaunchConfigSSD(DataType t_score, DataType t_bbox, nmsFunc function) + : t_score(t_score) + , t_bbox(t_bbox) + , function(function) + { + } + bool operator==(const nmsLaunchConfigSSD& other) + { + return t_score == other.t_score && t_bbox == other.t_bbox; + } }; static std::vector nmsFuncVec; -bool nmsInit() { - nmsFuncVec.push_back( - nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu)); - return true; +bool nmsInit() +{ + nmsFuncVec.push_back( + nmsLaunchConfigSSD(DataType::kFLOAT, DataType::kFLOAT, allClassNMS_gpu)); + return true; } -static bool initialized = nmsInit(); - -pluginStatus_t allClassNMS(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, const DataType DT_SCORE, const DataType DT_BBOX, - void *bbox_data, void *beforeNMS_scores, void *beforeNMS_index_array, - void *afterNMS_scores, void *afterNMS_index_array, bool flipXY) { - nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX); - for (unsigned i = 0; i < nmsFuncVec.size(); ++i) { - if (lc == nmsFuncVec[i]) { - DEBUG_PRINTF("all class nms kernel %d\n", i); - return nmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k, - nms_threshold, share_location, isNormalized, bbox_data, - beforeNMS_scores, beforeNMS_index_array, afterNMS_scores, - afterNMS_index_array, flipXY); +static bool initialized = nmsInit(); + +pluginStatus_t allClassNMS(cudaStream_t stream, + const int num, + const int num_classes, + const int num_preds_per_class, + const int top_k, + const float nms_threshold, + const bool share_location, + const bool isNormalized, + const DataType DT_SCORE, + const DataType DT_BBOX, + void* bbox_data, + void* beforeNMS_scores, + void* beforeNMS_index_array, + void* afterNMS_scores, + void* afterNMS_index_array, + bool flipXY) +{ + nmsLaunchConfigSSD lc(DT_SCORE, DT_BBOX); + for (unsigned i = 0; i < nmsFuncVec.size(); ++i) + { + if (lc == nmsFuncVec[i]) + { + DEBUG_PRINTF("all class nms kernel %d\n", i); + return nmsFuncVec[i].function(stream, + num, + num_classes, + num_preds_per_class, + top_k, + nms_threshold, + share_location, + isNormalized, + bbox_data, + beforeNMS_scores, + beforeNMS_index_array, + afterNMS_scores, + afterNMS_index_array, + flipXY); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu index 0edea2bfaf..e8c1cd2187 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/allClassRotatedNMS.cu @@ -6,490 +6,636 @@ #include "nms/kernel.h" -template -struct RotatedBox { - T x_ctr, y_ctr, w, h, a; +template +struct RotatedBox +{ + T x_ctr, y_ctr, w, h, a; }; -template -struct Point { - T x, y; - __host__ __device__ __forceinline__ Point(const T &px = 0, const T &py = 0) : x(px), y(py) 
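// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// nmsFuncVec / nmsInit / `static bool initialized` above form a small
// load-time registry: the static initializer forces registration before
// main(), and the dispatcher linearly scans for a matching (score, bbox)
// dtype pair. Standalone sketch of the same pattern, all names hypothetical:
#include <vector>

using DemoFn = int (*)(int);
struct DemoEntry { int key; DemoFn fn; };
static std::vector<DemoEntry> g_demo_registry;

static bool demo_register()
{
    g_demo_registry.push_back({0, [](int x) { return x + 1; }});  // key 0 ~ (kFLOAT, kFLOAT)
    return true;
}
static bool g_demo_registered = demo_register();  // runs during static init

int demo_dispatch(int key, int arg)
{
    for (const DemoEntry& e : g_demo_registry)
        if (e.key == key) return e.fn(arg);
    return -1;  // mirrors returning STATUS_BAD_PARAM when no entry matches
}
// ---------------------------------------------------------------------------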
{} - __host__ __device__ __forceinline__ Point operator+(const Point &p) const { - return Point(x + p.x, y + p.y); - } - __host__ __device__ __forceinline__ Point &operator+=(const Point &p) { - x += p.x; - y += p.y; - return *this; - } - __host__ __device__ __forceinline__ Point operator-(const Point &p) const { - return Point(x - p.x, y - p.y); - } - __host__ __device__ __forceinline__ Point operator*(const T coeff) const { - return Point(x * coeff, y * coeff); - } +template +struct Point +{ + T x, y; + __host__ __device__ __forceinline__ Point(const T& px = 0, const T& py = 0) + : x(px) + , y(py) + { + } + + __host__ __device__ __forceinline__ Point operator+(const Point& p) const + { + return Point(x + p.x, y + p.y); + } + + __host__ __device__ __forceinline__ Point& operator+=(const Point& p) + { + x += p.x; + y += p.y; + return *this; + } + + __host__ __device__ __forceinline__ Point operator-(const Point& p) const + { + return Point(x - p.x, y - p.y); + } + + __host__ __device__ __forceinline__ Point operator*(const T coeff) const + { + return Point(x * coeff, y * coeff); + } }; -template -__host__ __device__ __forceinline__ T dot_2d(const Point &A, const Point &B) { - return A.x * B.x + A.y * B.y; +template +__host__ __device__ __forceinline__ T dot_2d(const Point& A, const Point& B) +{ + return A.x * B.x + A.y * B.y; } -template -__host__ __device__ __forceinline__ T cross_2d(const Point &A, const Point &B) { - return A.x * B.y - B.x * A.y; +template +__host__ __device__ __forceinline__ T cross_2d(const Point& A, const Point& B) +{ + return A.x * B.y - B.x * A.y; } -template -__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox &box, - Point (&pts)[4]) { - // M_PI / 180. == 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - T cosTheta2 = (T)cos(theta) * 0.5f; - T sinTheta2 = (T)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; +template +__host__ __device__ __forceinline__ void get_rotated_vertices(const RotatedBox& box, + Point (&pts)[4]) +{ + // M_PI / 180. 
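// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// dot_2d / cross_2d above are the 2-D primitives the rotated-IoU code is
// built on; the sign of the scalar cross product encodes orientation, which
// both the intersection test and the convex hull below depend on:
constexpr float cross2_demo(float ax, float ay, float bx, float by)
{
    return ax * by - bx * ay;  // same convention as cross_2d(A, B)
}
static_assert(cross2_demo(1.f, 0.f, 0.f, 1.f) > 0.f,
              "(0,1) lies counter-clockwise from (1,0): positive cross product");
static_assert(cross2_demo(0.f, 1.f, 1.f, 0.f) < 0.f,
              "swapping the operands flips the sign (clockwise)");
// ---------------------------------------------------------------------------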
== 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; } -template +template __host__ __device__ __forceinline__ int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - T det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - T t1 = cross_2d(vec2[j], vec12) / det; - T t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } + Point (&intersections)[24]) +{ + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) + { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } - } - - // Check for vertices of rect1 inside rect2 - { - const auto &AB = vec2[0]; - const auto &DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. 
P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) + { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) + { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto &AB = vec1[0]; - const auto &DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts1[i]; + } + } } - } - return num; + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) + { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) + { + intersections[num++] = pts2[i]; + } + } + } + + return num; } -template +template __host__ __device__ __forceinline__ int convex_hull_graham(const Point (&p)[24], - const int &num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. - int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; + const int& num_in, + Point (&q)[24], + bool shift_to_zero = false) +{ + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
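// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// Compact restatement of the segment-intersection solve inside
// get_intersection_points above: p1 + v1*t1 == p2 + v2*t2 is solved with
// 2-D cross products, and a hit counts only when both parameters fall in
// [0, 1]. The type and names below are ours:
struct P2Demo { float x, y; };

inline bool segment_intersect_demo(P2Demo p1, P2Demo v1, P2Demo p2, P2Demo v2, P2Demo* hit)
{
    const float det = v2.x * v1.y - v1.x * v2.y;        // cross_2d(vec2, vec1)
    if (det <= 1e-14f && det >= -1e-14f) return false;  // (near-)parallel segments
    const P2Demo d{p2.x - p1.x, p2.y - p1.y};           // vec12 = pts2[j] - pts1[i]
    const float t1 = (v2.x * d.y - d.x * v2.y) / det;   // cross_2d(vec2, vec12) / det
    const float t2 = (v1.x * d.y - d.x * v1.y) / det;   // cross_2d(vec1, vec12) / det
    if (t1 < 0.f || t1 > 1.f || t2 < 0.f || t2 > 1.f) return false;
    *hit = P2Demo{p1.x + v1.x * t1, p1.y + v1.y * t1};  // point on segment 1
    return true;
}
// ---------------------------------------------------------------------------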
+ int t = 0; + for (int i = 1; i < num_in; i++) + { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) + { + t = i; + } } - } - auto &start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - T dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - - for (int i = 1; i < num_in - 1; i++) { - for (int j = i + 1; j < num_in; j++) { - T crossProduct = cross_2d(q[i], q[j]); - if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { - auto q_tmp = q[i]; - q[i] = q[j]; - q[j] = q_tmp; - auto dist_tmp = dist[i]; - dist[i] = dist[j]; - dist[j] = dist_tmp; - } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) + { + q[i] = p[i] - start; } - } - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) + { + dist[i] = dot_2d(q[i], q[i]); } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; + + for (int i = 1; i < num_in - 1; i++) + { + for (int j = i + 1; j < num_in; j++) + { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) + { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. 
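// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// The scan loop above is the classic Graham invariant: with q sorted by
// angle around the pivot, each candidate q[i] pops the stack while the
// triple (q[m-2], q[m-1], q[i]) fails to make a strict turn in the hull's
// winding direction. Because the pop condition uses ">= 0" rather than
// "> 0", collinear and duplicated points are discarded as well, so the
// returned hull carries no degenerate edges.
// ---------------------------------------------------------------------------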
- if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) + { + if (dist[k] > 1e-8) + { + break; + } + } + if (k == num_in) + { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) + { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) + { + m--; + } + q[m++] = q[i]; } - } - return m; + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) + { + for (int i = 0; i < m; i++) + { + q[i] += start; + } + } + + return m; } -template -__host__ __device__ __forceinline__ T polygon_area(const Point (&q)[24], const int &m) { - if (m <= 2) { - return 0; - } +template +__host__ __device__ __forceinline__ T polygon_area(const Point (&q)[24], const int& m) +{ + if (m <= 2) + { + return 0; + } - T area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } + T area = 0; + for (int i = 1; i < m - 1; i++) + { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } - return area / 2.0; + return area / 2.0; } -template -__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox &box1, - const RotatedBox &box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; +template +__host__ __device__ __forceinline__ T rotated_boxes_intersection(const RotatedBox& box1, + const RotatedBox& box2) +{ + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); - int num = get_intersection_points(pts1, pts2, intersectPts); + int num = get_intersection_points(pts1, pts2, intersectPts); - if (num <= 2) { - return 0.0; - } + if (num <= 2) + { + return 0.0; + } - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. - int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
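// ---------------------------------------------------------------------------
// [Editor's note -- illustrative aside, not part of the patch]
// polygon_area above is the triangle-fan form of the shoelace formula over
// an already-ordered convex vertex list. Scalar restatement, reusing the
// hypothetical P2Demo from the earlier sketch:
#include <cmath>

inline float polygon_area_demo(const P2Demo* q, int m)
{
    if (m <= 2) return 0.f;  // a degenerate hull has no area
    float area = 0.f;
    for (int i = 1; i < m - 1; ++i)
    {
        // |cross| of the fan triangle (q0, qi, qi+1) is twice its area
        const float ax = q[i].x - q[0].x,     ay = q[i].y - q[0].y;
        const float bx = q[i + 1].x - q[0].x, by = q[i + 1].y - q[0].y;
        area += std::fabs(ax * by - bx * ay);
    }
    return area / 2.f;
}
// ---------------------------------------------------------------------------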
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); } -template -__host__ __device__ __forceinline__ T single_box_iou_rotated(T const *const box1_raw, - T const *const box2_raw) { - // shift center to the middle point to achieve higher precision in result - RotatedBox box1, box2; - auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; - auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; - box1.x_ctr = box1_raw[0] - center_shift_x; - box1.y_ctr = box1_raw[1] - center_shift_y; - box1.w = box1_raw[2]; - box1.h = box1_raw[3]; - box1.a = box1_raw[4]; - box2.x_ctr = box2_raw[0] - center_shift_x; - box2.y_ctr = box2_raw[1] - center_shift_y; - box2.w = box2_raw[2]; - box2.h = box2_raw[3]; - box2.a = box2_raw[4]; - - const T area1 = box1.w * box1.h; - const T area2 = box2.w * box2.h; - if (area1 < 1e-14 || area2 < 1e-14) { - return 1.0f; - } - - const T intersection = rotated_boxes_intersection(box1, box2); - T baseS = 1.0; - baseS = (area1 + area2 - intersection); - const T iou = intersection / baseS; - return iou; +template +__host__ __device__ __forceinline__ T single_box_iou_rotated(T const* const box1_raw, + T const* const box2_raw) +{ + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) + { + return 1.0f; + } + + const T intersection = rotated_boxes_intersection(box1, box2); + T baseS = 1.0; + baseS = (area1 + area2 - intersection); + const T iou = intersection / baseS; + return iou; } /********** new NMS for only score and index array **********/ -template -__global__ void allClassRotatedNMS_kernel(const int num, const int num_classes, - const int num_preds_per_class, const int top_k, - const float nms_threshold, const bool share_location, - const bool isNormalized, - T_BBOX *bbox_data, // bbox_data should be float to - // preserve location information - T_SCORE *beforeNMS_scores, int *beforeNMS_index_array, - T_SCORE *afterNMS_scores, int *afterNMS_index_array) { - //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; - extern __shared__ bool kept_bboxinfo_flag[]; - for (int i = 0; i < num; i++) { - const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; - const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation - const int bbox_idx_offset = - share_location ? 
(i * num_preds_per_class) : (i * num_classes * num_preds_per_class); - - // local thread data - int loc_bboxIndex[TSIZE]; - T_BBOX loc_bbox[TSIZE * 5]; - - // initialize Bbox, Bboxinfo, kept_bboxinfo_flag - // Eliminate shared memory RAW hazard - __syncthreads(); +template +__global__ void allClassRotatedNMS_kernel(const int num, + const int num_classes, + const int num_preds_per_class, + const int top_k, + const float nms_threshold, + const bool share_location, + const bool isNormalized, + T_BBOX* bbox_data, // bbox_data should be float to preserve location information + T_SCORE* beforeNMS_scores, + int* beforeNMS_index_array, + T_SCORE* afterNMS_scores, + int* afterNMS_index_array) +{ + //__shared__ bool kept_bboxinfo_flag[CAFFE_CUDA_NUM_THREADS * TSIZE]; + extern __shared__ bool kept_bboxinfo_flag[]; + for (int i = 0; i < num; i++) + { + const int offset = i * num_classes * num_preds_per_class + blockIdx.x * num_preds_per_class; + const int max_idx = offset + top_k; // put top_k bboxes into NMS calculation + const int bbox_idx_offset = + share_location ? (i * num_preds_per_class) : (i * num_classes * num_preds_per_class); + + // local thread data + int loc_bboxIndex[TSIZE]; + T_BBOX loc_bbox[TSIZE * 5]; + + // initialize Bbox, Bboxinfo, kept_bboxinfo_flag + // Eliminate shared memory RAW hazard + __syncthreads(); #pragma unroll - for (int t = 0; t < TSIZE; t++) { - const int cur_idx = threadIdx.x + blockDim.x * t; - const int item_idx = offset + cur_idx; + for (int t = 0; t < TSIZE; t++) + { + const int cur_idx = threadIdx.x + blockDim.x * t; + const int item_idx = offset + cur_idx; + + if (item_idx < max_idx) + { + loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + + if (loc_bboxIndex[t] >= 0) + // if (loc_bboxIndex[t] != -1) + { + const int bbox_data_idx = share_location ? + (loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset) : + loc_bboxIndex[t]; + memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX)); + kept_bboxinfo_flag[cur_idx] = true; + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } + else + { + kept_bboxinfo_flag[cur_idx] = false; + } + } - if (item_idx < max_idx) { - loc_bboxIndex[t] = beforeNMS_index_array[item_idx]; + // filter out overlapped boxes with lower scores + int ref_item_idx = offset; + int ref_bbox_idx = share_location ? + (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) : + beforeNMS_index_array[ref_item_idx]; - if (loc_bboxIndex[t] >= 0) - // if (loc_bboxIndex[t] != -1) + while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) { - const int bbox_data_idx = share_location - ? 
(loc_bboxIndex[t] % num_preds_per_class + bbox_idx_offset)
-                                       : loc_bboxIndex[t];
-          memcpy(&loc_bbox[t * 5], &bbox_data[bbox_data_idx * 5], 5 * sizeof(T_BBOX));
-          kept_bboxinfo_flag[cur_idx] = true;
-        } else {
-          kept_bboxinfo_flag[cur_idx] = false;
+            T_BBOX ref_bbox[5];
+            memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX));
+
+            // Eliminate shared memory RAW hazard
+            __syncthreads();
+
+            for (int t = 0; t < TSIZE; t++)
+            {
+                const int cur_idx = threadIdx.x + blockDim.x * t;
+                const int item_idx = offset + cur_idx;
+
+                if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx))
+                {
+                    // TODO: may need to add bool normalized as argument, HERE true means
+                    // normalized
+                    if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold)
+                    {
+                        kept_bboxinfo_flag[cur_idx] = false;
+                    }
+                }
+            }
+            __syncthreads();
+
+            do {
+                ref_item_idx++;
+            } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);
+
+            ref_bbox_idx = share_location ?
+                               (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset) :
+                               beforeNMS_index_array[ref_item_idx];
         }
-      } else {
-        kept_bboxinfo_flag[cur_idx] = false;
-      }
-    }
-    // filter out overlapped boxes with lower scores
-    int ref_item_idx = offset;
-    int ref_bbox_idx =
-        share_location
-            ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
-            : beforeNMS_index_array[ref_item_idx];
-
-    while ((ref_bbox_idx != -1) && ref_item_idx < max_idx) {
-      T_BBOX ref_bbox[5];
-      memcpy(&ref_bbox[0], &bbox_data[ref_bbox_idx * 5], 5 * sizeof(T_BBOX));
-
-      // Eliminate shared memory RAW hazard
-      __syncthreads();
-
-      for (int t = 0; t < TSIZE; t++) {
-        const int cur_idx = threadIdx.x + blockDim.x * t;
-        const int item_idx = offset + cur_idx;
-
-        if ((kept_bboxinfo_flag[cur_idx]) && (item_idx > ref_item_idx)) {
-          // TODO: may need to add bool normalized as argument, HERE true means
-          // normalized
-          if (single_box_iou_rotated(&ref_bbox[0], loc_bbox + t * 5) > nms_threshold) {
-            kept_bboxinfo_flag[cur_idx] = false;
-          }
+        // store data
+        for (int t = 0; t < TSIZE; t++)
+        {
+            const int cur_idx = threadIdx.x + blockDim.x * t;
+            const int read_item_idx = offset + cur_idx;
+            const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
+            /*
+             * If not keeping the bbox
+             * Set the score to 0
+             * Set the bounding box index to -1
+             */
+            if (read_item_idx < max_idx)
+            {
+                afterNMS_scores[write_item_idx] = kept_bboxinfo_flag[cur_idx] ?
+                                                      beforeNMS_scores[read_item_idx] :
+                                                      0.0f;
+                afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
+            }
+        }
     }
-      }
-      __syncthreads();
-
-      do {
-        ref_item_idx++;
-      } while (ref_item_idx < max_idx && !kept_bboxinfo_flag[ref_item_idx - offset]);
-
-      ref_bbox_idx =
-          share_location
-              ? (beforeNMS_index_array[ref_item_idx] % num_preds_per_class + bbox_idx_offset)
-              : beforeNMS_index_array[ref_item_idx];
-    }
-
-    // store data
-    for (int t = 0; t < TSIZE; t++) {
-      const int cur_idx = threadIdx.x + blockDim.x * t;
-      const int read_item_idx = offset + cur_idx;
-      const int write_item_idx = (i * num_classes * top_k + blockIdx.x * top_k) + cur_idx;
-      /*
-       * If not not keeping the bbox
-       * Set the score to 0
-       * Set the bounding box index to -1
-       */
-      if (read_item_idx < max_idx) {
-        afterNMS_scores[write_item_idx] =
-            kept_bboxinfo_flag[cur_idx] ? beforeNMS_scores[read_item_idx] : 0.0f;
-        afterNMS_index_array[write_item_idx] = kept_bboxinfo_flag[cur_idx] ? loc_bboxIndex[t] : -1;
-      }
-    }
-  }
-  }
 }
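Each thread keeps up to TSIZE candidate boxes in registers (loc_bboxIndex, loc_bbox) while the block shares a single kept_bboxinfo_flag array, so one block of BS threads covers BS * TSIZE = top_k candidates of a class. The launcher below instantiates the kernel for TSIZE = 1..10 and selects the variant at run time from top_k; a self-contained sketch of that arithmetic:

```cpp
#include <cassert>
#include <cstdio>

int main()
{
    const int BS = 512;  // threads per block, as in allClassRotatedNMS_gpu below
    for (int top_k : {256, 512, 1000, 4096, 5120})
    {
        const int t_size = (top_k + BS - 1) / BS;  // boxes per thread, rounded up
        assert(t_size <= 10);                      // only P(1)..P(10) are instantiated
        std::printf("top_k=%4d -> TSIZE=%d\n", top_k, t_size);
    }
}
```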
-template <typename T_SCORE, typename T_BBOX>
-pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream, const int num, const int num_classes,
-                                      const int num_preds_per_class, const int top_k,
-                                      const float nms_threshold, const bool share_location,
-                                      const bool isNormalized, void *bbox_data,
-                                      void *beforeNMS_scores, void *beforeNMS_index_array,
-                                      void *afterNMS_scores, void *afterNMS_index_array) {
+template <typename T_SCORE, typename T_BBOX>
+pluginStatus_t allClassRotatedNMS_gpu(cudaStream_t stream,
+                                      const int num,
+                                      const int num_classes,
+                                      const int num_preds_per_class,
+                                      const int top_k,
+                                      const float nms_threshold,
+                                      const bool share_location,
+                                      const bool isNormalized,
+                                      void* bbox_data,
+                                      void* beforeNMS_scores,
+                                      void* beforeNMS_index_array,
+                                      void* afterNMS_scores,
+                                      void* afterNMS_index_array)
+{
 #define P(tsize) allClassRotatedNMS_kernel<T_SCORE, T_BBOX, (tsize)>
-  void (*kernel[10])(const int, const int, const int, const int, const float, const bool,
-                     const bool, float *, T_SCORE *, int *, T_SCORE *, int *) = {
-      P(1), P(2), P(3), P(4), P(5), P(6), P(7), P(8), P(9), P(10),
-  };
-
-  const int BS = 512;
-  const int GS = num_classes;
-  const int t_size = (top_k + BS - 1) / BS;
-
-  ASSERT(t_size <= 10);
-  kernel[t_size - 1]<<<GS, BS, BS * t_size * sizeof(bool), stream>>>(
-      num, num_classes, num_preds_per_class, top_k, nms_threshold, share_location, isNormalized,
-      (T_BBOX *)bbox_data, (T_SCORE *)beforeNMS_scores, (int *)beforeNMS_index_array,
-      (T_SCORE *)afterNMS_scores, (int *)afterNMS_index_array);
-
-  CSC(cudaGetLastError(), STATUS_FAILURE);
-  return STATUS_SUCCESS;
+    void (*kernel[10])(const int,
+                       const int,
+                       const int,
+                       const int,
+                       const float,
+                       const bool,
+                       const bool,
+                       float*,
+                       T_SCORE*,
+                       int*,
+                       T_SCORE*,
+                       int*) = {
+        P(1),
+        P(2),
+        P(3),
+        P(4),
+        P(5),
+        P(6),
+        P(7),
+        P(8),
+        P(9),
+        P(10),
+    };
+
+    const int BS = 512;
+    const int GS = num_classes;
+    const int t_size = (top_k + BS - 1) / BS;
+
+    ASSERT(t_size <= 10);
+    kernel[t_size - 1]<<<GS, BS, BS * t_size * sizeof(bool), stream>>>(num,
+                                                                       num_classes,
+                                                                       num_preds_per_class,
+                                                                       top_k,
+                                                                       nms_threshold,
+                                                                       share_location,
+                                                                       isNormalized,
+                                                                       (T_BBOX*)bbox_data,
+                                                                       (T_SCORE*)beforeNMS_scores,
+                                                                       (int*)beforeNMS_index_array,
+                                                                       (T_SCORE*)afterNMS_scores,
+                                                                       (int*)afterNMS_index_array);
+
+    CSC(cudaGetLastError(), STATUS_FAILURE);
+    return STATUS_SUCCESS;
 }

 // allClassNMS LAUNCH CONFIG
-typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t, const int, const int, const int, const int,
-                                         const float, const bool, const bool, void *, void *,
-                                         void *, void *, void *);
-
-struct rotatedNmsLaunchConfig {
-  DataType t_score;
-  DataType t_bbox;
-  rotatedNmsFunc function;
-
-  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox) : t_score(t_score), t_bbox(t_bbox) {}
-  rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function)
-      : t_score(t_score), t_bbox(t_bbox), function(function) {}
-  bool operator==(const rotatedNmsLaunchConfig &other) {
-    return t_score == other.t_score && t_bbox == other.t_bbox;
-  }
+typedef pluginStatus_t (*rotatedNmsFunc)(cudaStream_t,
+                                         const int,
+                                         const int,
+                                         const int,
+                                         const int,
+                                         const float,
+                                         const bool,
+                                         const bool,
+                                         void*,
+                                         void*,
+                                         void*,
+                                         void*,
+                                         void*);
+
+struct rotatedNmsLaunchConfig
+{
+    DataType t_score;
+    DataType t_bbox;
+    rotatedNmsFunc function;
+
+    rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox)
+        : t_score(t_score)
+        , t_bbox(t_bbox)
+    {
+    }
+    rotatedNmsLaunchConfig(DataType t_score, DataType t_bbox, rotatedNmsFunc function)
+        : t_score(t_score)
+        , t_bbox(t_bbox)
+        , function(function)
+    {
+    }
+    bool operator==(const rotatedNmsLaunchConfig& other)
+    {
+        return t_score == other.t_score && t_bbox == other.t_bbox;
+    }
 };

 static std::vector<rotatedNmsLaunchConfig> rotatedNmsFuncVec;

-bool rotatedNmsInit() {
-  rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT, DataType::kFLOAT,
-                                                     allClassRotatedNMS_gpu<float, float>));
-  return true;
+bool rotatedNmsInit()
+{
+    rotatedNmsFuncVec.push_back(rotatedNmsLaunchConfig(DataType::kFLOAT,
+                                                       DataType::kFLOAT,
+                                                       allClassRotatedNMS_gpu<float, float>));
+    return true;
 }

-static bool initialized = rotatedNmsInit();
-
-pluginStatus_t allClassRotatedNMS(cudaStream_t stream, const int num, const int num_classes,
-                                  const int num_preds_per_class, const int top_k,
-                                  const float nms_threshold, const bool share_location,
-                                  const bool isNormalized, const DataType DT_SCORE,
-                                  const DataType DT_BBOX, void *bbox_data, void *beforeNMS_scores,
-                                  void *beforeNMS_index_array, void *afterNMS_scores,
-                                  void *afterNMS_index_array, bool) {
-  auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
-  if (__cuda_arch__ == 720 && top_k >= 1000) {
-    printf("Warning: pre_top_k need to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
-           top_k);
-  }
-  rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);
-
-  for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i) {
-    if (lc == rotatedNmsFuncVec[i]) {
-      DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
-      return rotatedNmsFuncVec[i].function(stream, num, num_classes, num_preds_per_class, top_k,
-                                           nms_threshold, share_location, isNormalized, bbox_data,
-                                           beforeNMS_scores, beforeNMS_index_array, afterNMS_scores,
-                                           afterNMS_index_array);
+static bool initialized = rotatedNmsInit();
+
+pluginStatus_t allClassRotatedNMS(cudaStream_t stream,
+                                  const int num,
+                                  const int num_classes,
+                                  const int num_preds_per_class,
+                                  const int top_k,
+                                  const float nms_threshold,
+                                  const bool share_location,
+                                  const bool isNormalized,
+                                  const DataType DT_SCORE,
+                                  const DataType DT_BBOX,
+                                  void* bbox_data,
+                                  void* beforeNMS_scores,
+                                  void* beforeNMS_index_array,
+                                  void* afterNMS_scores,
+                                  void* afterNMS_index_array,
+                                  bool)
+{
+    auto __cuda_arch__ = get_cuda_arch(0);  // assume there is only one arch 7.2 device
+    if (__cuda_arch__ == 720 && top_k >= 1000)
+    {
+        printf("Warning: pre_top_k needs to be reduced for devices with arch 7.2, got pre_top_k=%d\n",
+               top_k);
+    }
+    rotatedNmsLaunchConfig lc(DT_SCORE, DT_BBOX);
+
+    for (unsigned i = 0; i < rotatedNmsFuncVec.size(); ++i)
+    {
+        if (lc == rotatedNmsFuncVec[i])
+        {
+            DEBUG_PRINTF("all class rotated nms kernel %d\n", i);
+            return rotatedNmsFuncVec[i].function(stream,
+                                                 num,
+                                                 num_classes,
+                                                 num_preds_per_class,
+                                                 top_k,
+                                                 nms_threshold,
+                                                 share_location,
+                                                 isNormalized,
+                                                 bbox_data,
+                                                 beforeNMS_scores,
+                                                 beforeNMS_index_array,
+                                                 afterNMS_scores,
+                                                 afterNMS_index_array);
+        }
     }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }
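allClassRotatedNMS resolves its implementation through a small registry keyed by the score and box data types, and the same launch-config pattern reappears below for gatherNMSOutputs, permuteData and the two sort kernels. A stripped-down sketch of the pattern (all names here are illustrative, not part of the plugin API):

```cpp
#include <cstdio>
#include <vector>

enum class DataType { kFLOAT, kHALF };
typedef int (*launchFunc)(int);  // stand-in for the real launcher signature

struct LaunchConfig
{
    DataType t_score;
    launchFunc function;
    bool operator==(const LaunchConfig& other) const { return t_score == other.t_score; }
};

static std::vector<LaunchConfig> registry = {
    {DataType::kFLOAT, [](int n) { std::printf("fp32 kernel, n=%d\n", n); return 0; }},
};

int dispatch(DataType t, int n)
{
    LaunchConfig key{t, nullptr};
    for (const auto& cfg : registry)
        if (cfg == key) return cfg.function(n);
    return -1;  // plays the role of STATUS_BAD_PARAM
}

int main() { return dispatch(DataType::kFLOAT, 8); }
```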
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
index 71cb7a8592..b5f880d87f 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/batched_nms_kernel.cpp
@@ -3,123 +3,215 @@
 // https://github.com/NVIDIA/TensorRT/tree/master/plugin/batchedNMSPlugin
 #include "nms/batched_nms_kernel.hpp"

-pluginStatus_t nmsInference(cudaStream_t stream, const int N, const int perBatchBoxesSize,
-                            const int perBatchScoresSize, const bool shareLocation,
-                            const int backgroundLabelId, const int numPredsPerClass,
-                            const int numClasses, const int topK, const int keepTopK,
-                            const float scoreThreshold, const float iouThreshold,
-                            const DataType DT_BBOX, const void* locData, const DataType DT_SCORE,
-                            const void* confData, void* nmsedDets, void* nmsedLabels,
-                            void* nmsedIndex, void* workspace, bool isNormalized, bool confSigmoid,
-                            bool clipBoxes, bool rotated) {
-  const int topKVal = topK < 0 ? numPredsPerClass : topK;
-  const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
-  // locCount = batch_size * number_boxes_per_sample * 4
-  const int locCount = N * perBatchBoxesSize;
-  /*
-   * shareLocation
-   * Bounding box are shared among all classes, i.e., a bounding box could be
-   * classified as any candidate class. Otherwise Bounding box are designed for
-   * specific classes, i.e., a bounding box could be classified as one certain
-   * class or not (binary classification).
-   */
-  const int numLocClasses = shareLocation ? 1 : numClasses;
-
-  size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
-  void* bboxDataRaw = workspace;
-  cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
-  pluginStatus_t status;
-
-  /*
-   * bboxDataRaw format:
-   * [batch size, numPriors (per sample), numLocClasses, 4]
-   */
-  // float for now
-  void* bboxData;
-  size_t bboxPermuteSize =
-      detectionForwardBBoxPermuteSize(shareLocation, N, perBatchBoxesSize, DataType::kFLOAT);
-  void* bboxPermute = nextWorkspacePtr((int8_t*)bboxDataRaw, bboxDataSize);
-
-  /*
-   * After permutation, bboxData format:
-   * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
-   * This is equivalent to swapping axis
-   */
-  if (!shareLocation) {
-    status = permuteData(stream, locCount, numLocClasses, numPredsPerClass, rotated ? 5 : 4,
-                         DataType::kFLOAT, false, bboxDataRaw, bboxPermute);
+pluginStatus_t nmsInference(cudaStream_t stream,
+                            const int N,
+                            const int perBatchBoxesSize,
+                            const int perBatchScoresSize,
+                            const bool shareLocation,
+                            const int backgroundLabelId,
+                            const int numPredsPerClass,
+                            const int numClasses,
+                            const int topK,
+                            const int keepTopK,
+                            const float scoreThreshold,
+                            const float iouThreshold,
+                            const DataType DT_BBOX,
+                            const void* locData,
+                            const DataType DT_SCORE,
+                            const void* confData,
+                            void* nmsedDets,
+                            void* nmsedLabels,
+                            void* nmsedIndex,
+                            void* workspace,
+                            bool isNormalized,
+                            bool confSigmoid,
+                            bool clipBoxes,
+                            bool rotated)
+{
+    const int topKVal = topK < 0 ? numPredsPerClass : topK;
+    const int keepTopKVal = keepTopK < 0 ? numPredsPerClass : keepTopK;
+    // locCount = batch_size * number_boxes_per_sample * 4
+    const int locCount = N * perBatchBoxesSize;
+    /*
+     * shareLocation
+     * Bounding boxes are shared among all classes, i.e., a bounding box could be
+     * classified as any candidate class. Otherwise, bounding boxes are designed for
+     * specific classes, i.e., a bounding box could be classified as one certain
+     * class or not (binary classification).
+     */
+    const int numLocClasses = shareLocation ? 1 : numClasses;
+
+    size_t bboxDataSize = detectionForwardBBoxDataSize(N, perBatchBoxesSize, DataType::kFLOAT);
+    void* bboxDataRaw = workspace;
+    cudaMemcpyAsync(bboxDataRaw, locData, bboxDataSize, cudaMemcpyDeviceToDevice, stream);
+    pluginStatus_t status;
+
+    /*
+     * bboxDataRaw format:
+     * [batch size, numPriors (per sample), numLocClasses, 4]
+     */
+    // float for now
+    void* bboxData;
+    size_t bboxPermuteSize = detectionForwardBBoxPermuteSize(shareLocation,
+                                                             N,
+                                                             perBatchBoxesSize,
+                                                             DataType::kFLOAT);
+    void* bboxPermute = nextWorkspacePtr((int8_t*)bboxDataRaw, bboxDataSize);
+
+    /*
+     * After permutation, bboxData format:
+     * [batch_size, numLocClasses, numPriors (per sample) (numPredsPerClass), 4]
+     * This is equivalent to swapping axis
+     */
+    if (!shareLocation)
+    {
+        status = permuteData(stream,
+                             locCount,
+                             numLocClasses,
+                             numPredsPerClass,
+                             rotated ? 5 : 4,
+                             DataType::kFLOAT,
+                             false,
+                             bboxDataRaw,
+                             bboxPermute);
+        ASSERT_FAILURE(status == STATUS_SUCCESS);
+        bboxData = bboxPermute;
+    }
+    /*
+     * If shareLocation, numLocClasses = 1
+     * No need to permute data on linear memory
+     */
+    else
+    {
+        bboxData = bboxDataRaw;
+    }
+
+    /*
+     * Conf data format
+     * [batch size, numPriors * param.numClasses, 1, 1]
+     */
+    const int numScores = N * perBatchScoresSize;
+    size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
+    void* scores = nextWorkspacePtr((int8_t*)bboxPermute, bboxPermuteSize);
+
+    // the conf scores need the same permutation
+    /*
+     * After permutation, score data format:
+     * [batch_size, numClasses, numPredsPerClass, 1]
+     */
+    status = permuteData(stream,
+                         numScores,
+                         numClasses,
+                         numPredsPerClass,
+                         1,
+                         DataType::kFLOAT,
+                         confSigmoid,
+                         confData,
+                         scores);
    ASSERT_FAILURE(status == STATUS_SUCCESS);
-    bboxData = bboxPermute;
-  }
-  /*
-   * If shareLocation, numLocClasses = 1
-   * No need to permute data on linear memory
-   */
-  else {
-    bboxData = bboxDataRaw;
-  }
-
-  /*
-   * Conf data format
-   * [batch size, numPriors * param.numClasses, 1, 1]
-   */
-  const int numScores = N * perBatchScoresSize;
-  size_t totalScoresSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
-  void* scores = nextWorkspacePtr((int8_t*)bboxPermute, bboxPermuteSize);
-
-  // need a conf_scores
-  /*
-   * After permutation, bboxData format:
-   * [batch_size, numClasses, numPredsPerClass, 1]
-   */
-  status = permuteData(stream, numScores, numClasses, numPredsPerClass, 1, DataType::kFLOAT,
-                       confSigmoid, confData, scores);
-  ASSERT_FAILURE(status == STATUS_SUCCESS);
-
-  size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
-  void* indices = nextWorkspacePtr((int8_t*)scores, totalScoresSize);
-
-  size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
-  size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
-  void* postNMSScores = nextWorkspacePtr((int8_t*)indices, indicesSize);
-  void* postNMSIndices = nextWorkspacePtr((int8_t*)postNMSScores, postNMSScoresSize);
-
-  void* sortingWorkspace = nextWorkspacePtr((int8_t*)postNMSIndices, postNMSIndicesSize);
-  // Sort the scores so that the following NMS could be applied.
-
-  status = sortScoresPerClass(stream, N, numClasses, numPredsPerClass, backgroundLabelId,
-                              scoreThreshold, DataType::kFLOAT, scores, indices, sortingWorkspace);
-  ASSERT_FAILURE(status == STATUS_SUCCESS);
-
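Every buffer in this function (bboxPermute, scores, and the indices, postNMS and sorting regions) is carved out of a single pre-sized workspace allocation by nextWorkspacePtr, which advances past the previous region and rounds up to the 256-byte CUDA_MEM_ALIGN boundary defined in kernel.cu further down in this diff. A minimal sketch of that pointer arithmetic:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors alignPtr/nextWorkspacePtr from kernel.cu below.
static int8_t* alignPtr(int8_t* ptr, uintptr_t to)
{
    uintptr_t addr = (uintptr_t)ptr;
    if (addr % to) addr += to - addr % to;
    return (int8_t*)addr;
}

static int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t prevSize)
{
    return alignPtr(ptr + prevSize, 256);  // 256 == CUDA_MEM_ALIGN
}

int main()
{
    alignas(256) int8_t workspace[4096];
    int8_t* scores = workspace;                        // region 0: say 1000 bytes
    int8_t* indices = nextWorkspacePtr(scores, 1000);  // starts at offset 1024
    std::printf("indices offset = %td\n", indices - workspace);
}
```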
-  // This is set to true as the input bounding boxes are of the format [ymin,
-  // xmin, ymax, xmax]. The default implementation assumes [xmin, ymin, xmax,
-  // ymax]
-  bool flipXY = false;
-  // NMS
-  if (rotated) {
-    status = allClassRotatedNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
-                                shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT,
-                                bboxData, scores, indices, postNMSScores, postNMSIndices, flipXY);
-  } else {
-    status = allClassNMS(stream, N, numClasses, numPredsPerClass, topKVal, iouThreshold,
-                         shareLocation, isNormalized, DataType::kFLOAT, DataType::kFLOAT, bboxData,
-                         scores, indices, postNMSScores, postNMSIndices, flipXY);
-  }
-
-  ASSERT_FAILURE(status == STATUS_SUCCESS);
-
-  // Sort the bounding boxes after NMS using scores
-  status = sortScoresPerImage(stream, N, numClasses * topKVal, DataType::kFLOAT, postNMSScores,
-                              postNMSIndices, scores, indices, sortingWorkspace);
-
-  ASSERT_FAILURE(status == STATUS_SUCCESS);
-
-  // Gather data from the sorted bounding boxes after NMS
-  status = gatherNMSOutputs(stream, shareLocation, N, numPredsPerClass, numClasses, topKVal,
-                            keepTopKVal, DataType::kFLOAT, DataType::kFLOAT, indices, scores,
-                            bboxData, nmsedDets, nmsedLabels, nmsedIndex, clipBoxes, rotated);
-
-  ASSERT_FAILURE(status == STATUS_SUCCESS);
-
-  return STATUS_SUCCESS;
+    size_t indicesSize = detectionForwardPreNMSSize(N, perBatchScoresSize);
+    void* indices = nextWorkspacePtr((int8_t*)scores, totalScoresSize);
+
+    size_t postNMSScoresSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
+    size_t postNMSIndicesSize = detectionForwardPostNMSSize(N, numClasses, topKVal);
+    void* postNMSScores = nextWorkspacePtr((int8_t*)indices, indicesSize);
+    void* postNMSIndices = nextWorkspacePtr((int8_t*)postNMSScores, postNMSScoresSize);
+
+    void* sortingWorkspace = nextWorkspacePtr((int8_t*)postNMSIndices, postNMSIndicesSize);
+    // Sort the scores so that the following NMS could be applied.
+
+    status = sortScoresPerClass(stream,
+                                N,
+                                numClasses,
+                                numPredsPerClass,
+                                backgroundLabelId,
+                                scoreThreshold,
+                                DataType::kFLOAT,
+                                scores,
+                                indices,
+                                sortingWorkspace);
+    ASSERT_FAILURE(status == STATUS_SUCCESS);
+
+    // flipXY stays false here: the input bounding boxes are already in
+    // [xmin, ymin, xmax, ymax] order; it would only need to be true for
+    // inputs in [ymin, xmin, ymax, xmax] order.
+    bool flipXY = false;
+    // NMS
+    if (rotated)
+    {
+        status = allClassRotatedNMS(stream,
+                                    N,
+                                    numClasses,
+                                    numPredsPerClass,
+                                    topKVal,
+                                    iouThreshold,
+                                    shareLocation,
+                                    isNormalized,
+                                    DataType::kFLOAT,
+                                    DataType::kFLOAT,
+                                    bboxData,
+                                    scores,
+                                    indices,
+                                    postNMSScores,
+                                    postNMSIndices,
+                                    flipXY);
+    }
+    else
+    {
+        status = allClassNMS(stream,
+                             N,
+                             numClasses,
+                             numPredsPerClass,
+                             topKVal,
+                             iouThreshold,
+                             shareLocation,
+                             isNormalized,
+                             DataType::kFLOAT,
+                             DataType::kFLOAT,
+                             bboxData,
+                             scores,
+                             indices,
+                             postNMSScores,
+                             postNMSIndices,
+                             flipXY);
+    }
+
+    ASSERT_FAILURE(status == STATUS_SUCCESS);
+
+    // Sort the bounding boxes after NMS using scores
+    status = sortScoresPerImage(stream,
+                                N,
+                                numClasses * topKVal,
+                                DataType::kFLOAT,
+                                postNMSScores,
+                                postNMSIndices,
+                                scores,
+                                indices,
+                                sortingWorkspace);
+
+    ASSERT_FAILURE(status == STATUS_SUCCESS);
+
+    // Gather data from the sorted bounding boxes after NMS
+    status = gatherNMSOutputs(stream,
+                              shareLocation,
+                              N,
+                              numPredsPerClass,
+                              numClasses,
+                              topKVal,
+                              keepTopKVal,
+                              DataType::kFLOAT,
+                              DataType::kFLOAT,
+                              indices,
+                              scores,
+                              bboxData,
+                              nmsedDets,
+                              nmsedLabels,
+                              nmsedIndex,
+                              clipBoxes,
+                              rotated);
+
+    ASSERT_FAILURE(status == STATUS_SUCCESS);
+
+    return STATUS_SUCCESS;
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu
index 58419f8c16..803924a4ee 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/gatherNMSOutputs.cu
@@ -6,159 +6,237 @@
 #include "nms/kernel.h"
 #include "trt_plugin_helper.hpp"

-template <typename T_BBOX, typename T_SCORE, unsigned nthds_per_cta, bool rotated>
-__launch_bounds__(nthds_per_cta) __global__
-    void gatherNMSOutputs_kernel(const bool shareLocation, const int numImages,
-                                 const int numPredsPerClass, const int numClasses, const int topK,
-                                 const int keepTopK, const int *indices, const T_SCORE *scores,
-                                 const T_BBOX *bboxData, T_BBOX *nmsedDets, int *nmsedLabels,
-                                 int *nmsedIndex, bool clipBoxes) {
-  if (keepTopK > topK) return;
-  for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK;
-       i += gridDim.x * nthds_per_cta) {
-    const int imgId = i / keepTopK;
-    const int detId = i % keepTopK;
-    const int offset = imgId * numClasses * topK;
-    const int index = indices[offset + detId];
-    const T_SCORE score = scores[offset + detId];
-    if (index == -1) {
-      nmsedLabels[i] = -1;
-      if (nmsedIndex != nullptr) {
-        nmsedIndex[i] = -1;
-      }
-      if (rotated) {
-        nmsedDets[i * 6] = 0;
-        nmsedDets[i * 6 + 1] = 0;
-        nmsedDets[i * 6 + 2] = 0;
-        nmsedDets[i * 6 + 3] = 0;
-        nmsedDets[i * 6 + 4] = 0;
-        nmsedDets[i * 6 + 5] = 0;
-      } else {
-        nmsedDets[i * 5] = 0;
-        nmsedDets[i * 5 + 1] = 0;
-        nmsedDets[i * 5 + 2] = 0;
-        nmsedDets[i * 5 + 3] = 0;
-        nmsedDets[i * 5 + 4] = 0;
-      }
-    } else {
-      const int bboxOffset =
-          imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass));
-      nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass;  // label
-      if (rotated) {
-        const int bboxId = ((shareLocation ?
(index % numPredsPerClass) - : index % (numClasses * numPredsPerClass)) + - bboxOffset) * - 5; - if (nmsedIndex != nullptr) { - nmsedIndex[i] = bboxId / 5 - bboxOffset; +template +__launch_bounds__(nthds_per_cta) __global__ void gatherNMSOutputs_kernel(const bool shareLocation, + const int numImages, + const int numPredsPerClass, + const int numClasses, + const int topK, + const int keepTopK, + const int* indices, + const T_SCORE* scores, + const T_BBOX* bboxData, + T_BBOX* nmsedDets, + int* nmsedLabels, + int* nmsedIndex, + bool clipBoxes) +{ + if (keepTopK > topK) return; + + for (int i = blockIdx.x * nthds_per_cta + threadIdx.x; i < numImages * keepTopK; i += gridDim.x * nthds_per_cta) + { + const int imgId = i / keepTopK; + const int detId = i % keepTopK; + const int offset = imgId * numClasses * topK; + const int index = indices[offset + detId]; + const T_SCORE score = scores[offset + detId]; + + if (index == -1) + { + nmsedLabels[i] = -1; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = -1; + } + if (rotated) + { + nmsedDets[i * 6] = 0; + nmsedDets[i * 6 + 1] = 0; + nmsedDets[i * 6 + 2] = 0; + nmsedDets[i * 6 + 3] = 0; + nmsedDets[i * 6 + 4] = 0; + nmsedDets[i * 6 + 5] = 0; + } + else + { + nmsedDets[i * 5] = 0; + nmsedDets[i * 5 + 1] = 0; + nmsedDets[i * 5 + 2] = 0; + nmsedDets[i * 5 + 3] = 0; + nmsedDets[i * 5 + 4] = 0; + } } - // clipped bbox xmin - nmsedDets[i * 6] = - clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; - // clipped bbox ymin - nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 1]; - // clipped bbox xmax - nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 2]; - // clipped bbox ymax - nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 3]; - // clipped bbox angle - nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 4]; - nmsedDets[i * 6 + 5] = score; - } else { - const int bboxId = ((shareLocation ? (index % numPredsPerClass) - : index % (numClasses * numPredsPerClass)) + - bboxOffset) * - 4; - if (nmsedIndex != nullptr) { - nmsedIndex[i] = bboxId / 4 - bboxOffset; + else + { + const int bboxOffset = + imgId * (shareLocation ? numPredsPerClass : (numClasses * numPredsPerClass)); + nmsedLabels[i] = (index % (numClasses * numPredsPerClass)) / numPredsPerClass; // label + if (rotated) + { + const int bboxId = ((shareLocation ? (index % numPredsPerClass) : index % (numClasses * numPredsPerClass)) + + bboxOffset) * + 5; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = bboxId / 5 - bboxOffset; + } + // clipped bbox xmin + nmsedDets[i * 6] = + clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; + // clipped bbox ymin + nmsedDets[i * 6 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 1]; + // clipped bbox xmax + nmsedDets[i * 6 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 2]; + // clipped bbox ymax + nmsedDets[i * 6 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 3]; + // clipped bbox angle + nmsedDets[i * 6 + 4] = clipBoxes ? max(min(bboxData[bboxId + 4], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 4]; + nmsedDets[i * 6 + 5] = score; + } + else + { + const int bboxId = ((shareLocation ? 
(index % numPredsPerClass) : index % (numClasses * numPredsPerClass)) + + bboxOffset) * + 4; + if (nmsedIndex != nullptr) + { + nmsedIndex[i] = bboxId / 4 - bboxOffset; + } + // clipped bbox xmin + nmsedDets[i * 5] = + clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; + // clipped bbox ymin + nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 1]; + // clipped bbox xmax + nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 2]; + // clipped bbox ymax + nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId + 3]; + nmsedDets[i * 5 + 4] = score; + } } - // clipped bbox xmin - nmsedDets[i * 5] = - clipBoxes ? max(min(bboxData[bboxId], T_BBOX(1.)), T_BBOX(0.)) : bboxData[bboxId]; - // clipped bbox ymin - nmsedDets[i * 5 + 1] = clipBoxes ? max(min(bboxData[bboxId + 1], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 1]; - // clipped bbox xmax - nmsedDets[i * 5 + 2] = clipBoxes ? max(min(bboxData[bboxId + 2], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 2]; - // clipped bbox ymax - nmsedDets[i * 5 + 3] = clipBoxes ? max(min(bboxData[bboxId + 3], T_BBOX(1.)), T_BBOX(0.)) - : bboxData[bboxId + 3]; - nmsedDets[i * 5 + 4] = score; - } } - } } -template -pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, const bool shareLocation, - const int numImages, const int numPredsPerClass, - const int numClasses, const int topK, const int keepTopK, - const void *indices, const void *scores, const void *bboxData, - void *nmsedDets, void *nmsedLabels, void *nmsedIndex, - bool clipBoxes) { - const int BS = 32; - const int GS = 32; - gatherNMSOutputs_kernel<<>>( - shareLocation, numImages, numPredsPerClass, numClasses, topK, keepTopK, (int *)indices, - (T_SCORE *)scores, (T_BBOX *)bboxData, (T_BBOX *)nmsedDets, (int *)nmsedLabels, - (int *)nmsedIndex, clipBoxes); +template +pluginStatus_t gatherNMSOutputs_gpu(cudaStream_t stream, + const bool shareLocation, + const int numImages, + const int numPredsPerClass, + const int numClasses, + const int topK, + const int keepTopK, + const void* indices, + const void* scores, + const void* bboxData, + void* nmsedDets, + void* nmsedLabels, + void* nmsedIndex, + bool clipBoxes) +{ + const int BS = 32; + const int GS = 32; + gatherNMSOutputs_kernel<<>>( + shareLocation, + numImages, + numPredsPerClass, + numClasses, + topK, + keepTopK, + (int*)indices, + (T_SCORE*)scores, + (T_BBOX*)bboxData, + (T_BBOX*)nmsedDets, + (int*)nmsedLabels, + (int*)nmsedIndex, + clipBoxes); - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // gatherNMSOutputs LAUNCH CONFIG {{{ -typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, const bool, const int, const int, const int, - const int, const int, const void *, const void *, const void *, - void *, void *, void *, bool); -struct nmsOutLaunchConfig { - DataType t_bbox; - DataType t_score; - bool rotated; - nmsOutFunc function; +typedef pluginStatus_t (*nmsOutFunc)(cudaStream_t, + const bool, + const int, + const int, + const int, + const int, + const int, + const void*, + const void*, + const void*, + void*, + void*, + void*, + bool); +struct nmsOutLaunchConfig +{ + DataType t_bbox; + DataType t_score; + bool rotated; + nmsOutFunc function; - nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated) - : t_bbox(t_bbox), t_score(t_score), 
rotated(rotated) {} - nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function) - : t_bbox(t_bbox), t_score(t_score), rotated(rotated), function(function) {} - bool operator==(const nmsOutLaunchConfig &other) { - return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated; - } + nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated) + : t_bbox(t_bbox) + , t_score(t_score) + , rotated(rotated) + { + } + nmsOutLaunchConfig(DataType t_bbox, DataType t_score, bool rotated, nmsOutFunc function) + : t_bbox(t_bbox) + , t_score(t_score) + , rotated(rotated) + , function(function) + { + } + bool operator==(const nmsOutLaunchConfig& other) + { + return t_bbox == other.t_bbox && t_score == other.t_score && rotated == other.rotated; + } }; using nvinfer1::DataType; static std::vector nmsOutFuncVec; -bool nmsOutputInit() { - nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false, - gatherNMSOutputs_gpu)); - nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true, - gatherNMSOutputs_gpu)); - return true; +bool nmsOutputInit() +{ + nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, false, gatherNMSOutputs_gpu)); + nmsOutFuncVec.push_back(nmsOutLaunchConfig(DataType::kFLOAT, DataType::kFLOAT, true, gatherNMSOutputs_gpu)); + return true; } -static bool initialized = nmsOutputInit(); +static bool initialized = nmsOutputInit(); -pluginStatus_t gatherNMSOutputs(cudaStream_t stream, const bool shareLocation, const int numImages, - const int numPredsPerClass, const int numClasses, const int topK, - const int keepTopK, const DataType DT_BBOX, const DataType DT_SCORE, - const void *indices, const void *scores, const void *bboxData, - void *nmsedDets, void *nmsedLabels, void *nmsedIndex, - bool clipBoxes, bool rotated) { - nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated); - for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) { - if (lc == nmsOutFuncVec[i]) { - DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i); - return nmsOutFuncVec[i].function(stream, shareLocation, numImages, numPredsPerClass, - numClasses, topK, keepTopK, indices, scores, bboxData, - nmsedDets, nmsedLabels, nmsedIndex, clipBoxes); +pluginStatus_t gatherNMSOutputs(cudaStream_t stream, + const bool shareLocation, + const int numImages, + const int numPredsPerClass, + const int numClasses, + const int topK, + const int keepTopK, + const DataType DT_BBOX, + const DataType DT_SCORE, + const void* indices, + const void* scores, + const void* bboxData, + void* nmsedDets, + void* nmsedLabels, + void* nmsedIndex, + bool clipBoxes, + bool rotated) +{ + nmsOutLaunchConfig lc = nmsOutLaunchConfig(DT_BBOX, DT_SCORE, rotated); + for (unsigned i = 0; i < nmsOutFuncVec.size(); ++i) + { + if (lc == nmsOutFuncVec[i]) + { + DEBUG_PRINTF("gatherNMSOutputs kernel %d\n", i); + return nmsOutFuncVec[i].function(stream, + shareLocation, + numImages, + numPredsPerClass, + numClasses, + topK, + keepTopK, + indices, + scores, + bboxData, + nmsedDets, + nmsedLabels, + nmsedIndex, + clipBoxes); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu index f0e1c9d0cc..36228de174 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/kernel.cu @@ -12,96 +12,118 @@ #define 
CUDA_MEM_ALIGN 256 // return cuda arch -size_t get_cuda_arch(int devID) { - int computeMode = -1, major = 0, minor = 0; - CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); - CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - return major * 100 + minor * 10; +size_t get_cuda_arch(int devID) +{ + int computeMode = -1, major = 0, minor = 0; + CUASSERT(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + CUASSERT(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + CUASSERT(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + return major * 100 + minor * 10; } // ALIGNPTR -int8_t *alignPtr(int8_t *ptr, uintptr_t to) { - uintptr_t addr = (uintptr_t)ptr; - if (addr % to) { - addr += to - addr % to; - } - return (int8_t *)addr; +int8_t* alignPtr(int8_t* ptr, uintptr_t to) +{ + uintptr_t addr = (uintptr_t)ptr; + if (addr % to) + { + addr += to - addr % to; + } + return (int8_t*)addr; } // NEXTWORKSPACEPTR -int8_t *nextWorkspacePtr(int8_t *ptr, uintptr_t previousWorkspaceSize) { - uintptr_t addr = (uintptr_t)ptr; - addr += previousWorkspaceSize; - return alignPtr((int8_t *)addr, CUDA_MEM_ALIGN); +int8_t* nextWorkspacePtr(int8_t* ptr, uintptr_t previousWorkspaceSize) +{ + uintptr_t addr = (uintptr_t)ptr; + addr += previousWorkspaceSize; + return alignPtr((int8_t*)addr, CUDA_MEM_ALIGN); } // CALCULATE TOTAL WORKSPACE SIZE -size_t calculateTotalWorkspaceSize(size_t *workspaces, int count) { - size_t total = 0; - for (int i = 0; i < count; i++) { - total += workspaces[i]; - if (workspaces[i] % CUDA_MEM_ALIGN) { - total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); +size_t calculateTotalWorkspaceSize(size_t* workspaces, int count) +{ + size_t total = 0; + for (int i = 0; i < count; i++) + { + total += workspaces[i]; + if (workspaces[i] % CUDA_MEM_ALIGN) + { + total += CUDA_MEM_ALIGN - (workspaces[i] % CUDA_MEM_ALIGN); + } } - } - return total; + return total; } using nvinfer1::DataType; -template -__launch_bounds__(nthds_per_cta) __global__ - void setUniformOffsets_kernel(const int num_segments, const int offset, int *d_offsets) { - const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; - if (idx <= num_segments) d_offsets[idx] = idx * offset; +template +__launch_bounds__(nthds_per_cta) __global__ void setUniformOffsets_kernel(const int num_segments, + const int offset, + int* d_offsets) +{ + const int idx = blockIdx.x * nthds_per_cta + threadIdx.x; + if (idx <= num_segments) d_offsets[idx] = idx * offset; } -void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset, - int *d_offsets) { - const int BS = 32; - const int GS = (num_segments + 1 + BS - 1) / BS; - setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); +void setUniformOffsets(cudaStream_t stream, const int num_segments, const int offset, int* d_offsets) +{ + const int BS = 32; + const int GS = (num_segments + 1 + BS - 1) / BS; + setUniformOffsets_kernel<<>>(num_segments, offset, d_offsets); } -size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) { - if (DT_BBOX == DataType::kFLOAT) { - return N * C1 * sizeof(float); - } +size_t detectionForwardBBoxDataSize(int N, int C1, DataType DT_BBOX) +{ + if (DT_BBOX == DataType::kFLOAT) + { + return N * C1 * sizeof(float); + } - printf("Only FP32 type bounding boxes are supported.\n"); - return (size_t)-1; + printf("Only FP32 
type bounding boxes are supported.\n"); + return (size_t)-1; } -size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) { - if (DT_BBOX == DataType::kFLOAT) { - return shareLocation ? 0 : N * C1 * sizeof(float); - } - printf("Only FP32 type bounding boxes are supported.\n"); - return (size_t)-1; +size_t detectionForwardBBoxPermuteSize(bool shareLocation, int N, int C1, DataType DT_BBOX) +{ + if (DT_BBOX == DataType::kFLOAT) + { + return shareLocation ? 0 : N * C1 * sizeof(float); + } + printf("Only FP32 type bounding boxes are supported.\n"); + return (size_t)-1; } -size_t detectionForwardPreNMSSize(int N, int C2) { - ASSERT(sizeof(float) == sizeof(int)); - return N * C2 * sizeof(float); +size_t detectionForwardPreNMSSize(int N, int C2) +{ + ASSERT(sizeof(float) == sizeof(int)); + return N * C2 * sizeof(float); } -size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) { - ASSERT(sizeof(float) == sizeof(int)); - return N * numClasses * topK * sizeof(float); +size_t detectionForwardPostNMSSize(int N, int numClasses, int topK) +{ + ASSERT(sizeof(float) == sizeof(int)); + return N * numClasses * topK * sizeof(float); } -size_t detectionInferenceWorkspaceSize(bool shareLocation, int N, int C1, int C2, int numClasses, - int numPredsPerClass, int topK, DataType DT_BBOX, - DataType DT_SCORE) { - size_t wss[7]; - wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); - wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); - wss[2] = detectionForwardPreNMSSize(N, C2); - wss[3] = detectionForwardPreNMSSize(N, C2); - wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); - wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); - wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), - sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); - return calculateTotalWorkspaceSize(wss, 7); +size_t detectionInferenceWorkspaceSize(bool shareLocation, + int N, + int C1, + int C2, + int numClasses, + int numPredsPerClass, + int topK, + DataType DT_BBOX, + DataType DT_SCORE) +{ + size_t wss[7]; + wss[0] = detectionForwardBBoxDataSize(N, C1, DT_BBOX); + wss[1] = detectionForwardBBoxPermuteSize(shareLocation, N, C1, DT_BBOX); + wss[2] = detectionForwardPreNMSSize(N, C2); + wss[3] = detectionForwardPreNMSSize(N, C2); + wss[4] = detectionForwardPostNMSSize(N, numClasses, topK); + wss[5] = detectionForwardPostNMSSize(N, numClasses, topK); + wss[6] = std::max(sortScoresPerClassWorkspaceSize(N, numClasses, numPredsPerClass, DT_SCORE), + sortScoresPerImageWorkspaceSize(N, numClasses * topK, DT_SCORE)); + return calculateTotalWorkspaceSize(wss, 7); } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu index 659c964970..327536d8b1 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/permuteData.cu @@ -5,72 +5,120 @@ #include "nms/kernel.h" -template -__launch_bounds__(nthds_per_cta) __global__ - void permuteData_kernel(const int nthreads, const int num_classes, const int num_data, - const int num_dim, bool confSigmoid, const Dtype *data, - Dtype *new_data) { - // data format: [batch_size, num_data, num_classes, num_dim] - for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads; - index += nthds_per_cta * gridDim.x) { - const int i = index % num_dim; - const int c = (index / num_dim) % num_classes; - 
const int d = (index / num_dim / num_classes) % num_data; - const int n = index / num_dim / num_classes / num_data; - const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; - float result = data[index]; - if (confSigmoid) result = exp(result) / (1 + exp(result)); +template +__launch_bounds__(nthds_per_cta) __global__ void permuteData_kernel(const int nthreads, + const int num_classes, + const int num_data, + const int num_dim, + bool confSigmoid, + const Dtype* data, + Dtype* new_data) +{ + // data format: [batch_size, num_data, num_classes, num_dim] + for (int index = blockIdx.x * nthds_per_cta + threadIdx.x; index < nthreads; + index += nthds_per_cta * gridDim.x) + { + const int i = index % num_dim; + const int c = (index / num_dim) % num_classes; + const int d = (index / num_dim / num_classes) % num_data; + const int n = index / num_dim / num_classes / num_data; + const int new_index = ((n * num_classes + c) * num_data + d) * num_dim + i; + float result = data[index]; + if (confSigmoid) result = exp(result) / (1 + exp(result)); - new_data[new_index] = result; - } - // new data format: [batch_size, num_classes, num_data, num_dim] + new_data[new_index] = result; + } + // new data format: [batch_size, num_classes, num_data, num_dim] } -template -pluginStatus_t permuteData_gpu(cudaStream_t stream, const int nthreads, const int num_classes, - const int num_data, const int num_dim, bool confSigmoid, - const void *data, void *new_data) { - const int BS = 512; - const int GS = (nthreads + BS - 1) / BS; - permuteData_kernel<<>>(nthreads, num_classes, num_data, num_dim, - confSigmoid, (const Dtype *)data, - (Dtype *)new_data); - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; +template +pluginStatus_t permuteData_gpu(cudaStream_t stream, + const int nthreads, + const int num_classes, + const int num_data, + const int num_dim, + bool confSigmoid, + const void* data, + void* new_data) +{ + const int BS = 512; + const int GS = (nthreads + BS - 1) / BS; + permuteData_kernel<<>>(nthreads, + num_classes, + num_data, + num_dim, + confSigmoid, + (const Dtype*)data, + (Dtype*)new_data); + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // permuteData LAUNCH CONFIG -typedef pluginStatus_t (*pdFunc)(cudaStream_t, const int, const int, const int, const int, bool, - const void *, void *); +typedef pluginStatus_t (*pdFunc)(cudaStream_t, + const int, + const int, + const int, + const int, + bool, + const void*, + void*); -struct pdLaunchConfig { - DataType t_data; - pdFunc function; +struct pdLaunchConfig +{ + DataType t_data; + pdFunc function; - pdLaunchConfig(DataType t_data) : t_data(t_data) {} - pdLaunchConfig(DataType t_data, pdFunc function) : t_data(t_data), function(function) {} - bool operator==(const pdLaunchConfig &other) { return t_data == other.t_data; } + pdLaunchConfig(DataType t_data) + : t_data(t_data) + { + } + pdLaunchConfig(DataType t_data, pdFunc function) + : t_data(t_data) + , function(function) + { + } + bool operator==(const pdLaunchConfig& other) + { + return t_data == other.t_data; + } }; static std::vector pdFuncVec; -bool permuteDataInit() { - pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu)); - return true; +bool permuteDataInit() +{ + pdFuncVec.push_back(pdLaunchConfig(DataType::kFLOAT, permuteData_gpu)); + return true; } -static bool initialized = permuteDataInit(); +static bool initialized = permuteDataInit(); -pluginStatus_t permuteData(cudaStream_t stream, const int nthreads, const int 
num_classes, - const int num_data, const int num_dim, const DataType DT_DATA, - bool confSigmoid, const void *data, void *new_data) { - pdLaunchConfig lc = pdLaunchConfig(DT_DATA); - for (unsigned i = 0; i < pdFuncVec.size(); ++i) { - if (lc == pdFuncVec[i]) { - DEBUG_PRINTF("permuteData kernel %d\n", i); - return pdFuncVec[i].function(stream, nthreads, num_classes, num_data, num_dim, confSigmoid, - data, new_data); +pluginStatus_t permuteData(cudaStream_t stream, + const int nthreads, + const int num_classes, + const int num_data, + const int num_dim, + const DataType DT_DATA, + bool confSigmoid, + const void* data, + void* new_data) +{ + pdLaunchConfig lc = pdLaunchConfig(DT_DATA); + for (unsigned i = 0; i < pdFuncVec.size(); ++i) + { + if (lc == pdFuncVec[i]) + { + DEBUG_PRINTF("permuteData kernel %d\n", i); + return pdFuncVec[i].function(stream, + nthreads, + num_classes, + num_data, + num_dim, + confSigmoid, + data, + new_data); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu index e72f040cc9..df506d3896 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerClass.cu @@ -8,134 +8,209 @@ #include "nms/kernel.h" #include "trt_plugin_helper.hpp" -template -__launch_bounds__(nthds_per_cta) __global__ - void prepareSortData(const int num, const int num_classes, const int num_preds_per_class, - const int background_label_id, const float confidence_threshold, - T_SCORE *conf_scores_gpu, T_SCORE *temp_scores, int *temp_idx, - int *d_offsets) { - // Prepare scores data for sort - const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x; - const int numPredsPerBatch = num_classes * num_preds_per_class; - if (cur_idx < numPredsPerBatch) { - const int class_idx = cur_idx / num_preds_per_class; - for (int i = 0; i < num; i++) { - const int targetIdx = i * numPredsPerBatch + cur_idx; - const T_SCORE score = conf_scores_gpu[targetIdx]; +template +__launch_bounds__(nthds_per_cta) __global__ void prepareSortData(const int num, + const int num_classes, + const int num_preds_per_class, + const int background_label_id, + const float confidence_threshold, + T_SCORE* conf_scores_gpu, + T_SCORE* temp_scores, + int* temp_idx, + int* d_offsets) +{ + // Prepare scores data for sort + const int cur_idx = blockIdx.x * nthds_per_cta + threadIdx.x; + const int numPredsPerBatch = num_classes * num_preds_per_class; + if (cur_idx < numPredsPerBatch) + { + const int class_idx = cur_idx / num_preds_per_class; + for (int i = 0; i < num; i++) + { + const int targetIdx = i * numPredsPerBatch + cur_idx; + const T_SCORE score = conf_scores_gpu[targetIdx]; - // "Clear" background labeled score and index - // Because we do not care about background - if (class_idx == background_label_id) { - // Set scores to 0 - // Set label = -1 - temp_scores[targetIdx] = 0.0f; - temp_idx[targetIdx] = -1; - conf_scores_gpu[targetIdx] = 0.0f; - } - // "Clear" scores lower than threshold - else { - if (score > confidence_threshold) { - temp_scores[targetIdx] = score; - temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch; - } else { - // Set scores to 0 - // Set label = -1 - temp_scores[targetIdx] = 0.0f; - temp_idx[targetIdx] = -1; - conf_scores_gpu[targetIdx] = 0.0f; - // TODO: HERE writing memory too many times - } - } + // "Clear" background labeled score and 
index + // Because we do not care about background + if (class_idx == background_label_id) + { + // Set scores to 0 + // Set label = -1 + temp_scores[targetIdx] = 0.0f; + temp_idx[targetIdx] = -1; + conf_scores_gpu[targetIdx] = 0.0f; + } + // "Clear" scores lower than threshold + else + { + if (score > confidence_threshold) + { + temp_scores[targetIdx] = score; + temp_idx[targetIdx] = cur_idx + i * numPredsPerBatch; + } + else + { + // Set scores to 0 + // Set label = -1 + temp_scores[targetIdx] = 0.0f; + temp_idx[targetIdx] = -1; + conf_scores_gpu[targetIdx] = 0.0f; + // TODO: HERE writing memory too many times + } + } - if ((cur_idx % num_preds_per_class) == 0) { - const int offset_ct = i * num_classes + cur_idx / num_preds_per_class; - d_offsets[offset_ct] = offset_ct * num_preds_per_class; - // set the last element in d_offset - if (blockIdx.x == 0 && threadIdx.x == 0) - d_offsets[num * num_classes] = num * numPredsPerBatch; - } + if ((cur_idx % num_preds_per_class) == 0) + { + const int offset_ct = i * num_classes + cur_idx / num_preds_per_class; + d_offsets[offset_ct] = offset_ct * num_preds_per_class; + // set the last element in d_offset + if (blockIdx.x == 0 && threadIdx.x == 0) + d_offsets[num * num_classes] = num * numPredsPerBatch; + } + } } - } } -template -pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int background_label_id, - const float confidence_threshold, void *conf_scores_gpu, - void *index_array_gpu, void *workspace) { - const int num_segments = num * num_classes; - void *temp_scores = workspace; - const int arrayLen = num * num_classes * num_preds_per_class; - void *temp_idx = nextWorkspacePtr((int8_t *)temp_scores, arrayLen * sizeof(T_SCORE)); - void *d_offsets = nextWorkspacePtr((int8_t *)temp_idx, arrayLen * sizeof(int)); - size_t cubOffsetSize = (num_segments + 1) * sizeof(int); - void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, cubOffsetSize); +template +pluginStatus_t sortScoresPerClass_gpu(cudaStream_t stream, + const int num, + const int num_classes, + const int num_preds_per_class, + const int background_label_id, + const float confidence_threshold, + void* conf_scores_gpu, + void* index_array_gpu, + void* workspace) +{ + const int num_segments = num * num_classes; + void* temp_scores = workspace; + const int arrayLen = num * num_classes * num_preds_per_class; + void* temp_idx = nextWorkspacePtr((int8_t*)temp_scores, arrayLen * sizeof(T_SCORE)); + void* d_offsets = nextWorkspacePtr((int8_t*)temp_idx, arrayLen * sizeof(int)); + size_t cubOffsetSize = (num_segments + 1) * sizeof(int); + void* cubWorkspace = nextWorkspacePtr((int8_t*)d_offsets, cubOffsetSize); - const int BS = 512; - const int GS = (num_classes * num_preds_per_class + BS - 1) / BS; - prepareSortData<<>>( - num, num_classes, num_preds_per_class, background_label_id, confidence_threshold, - (T_SCORE *)conf_scores_gpu, (T_SCORE *)temp_scores, (int *)temp_idx, (int *)d_offsets); + const int BS = 512; + const int GS = (num_classes * num_preds_per_class + BS - 1) / BS; + prepareSortData<<>>( + num, + num_classes, + num_preds_per_class, + background_label_id, + confidence_threshold, + (T_SCORE*)conf_scores_gpu, + (T_SCORE*)temp_scores, + (int*)temp_idx, + (int*)d_offsets); - size_t temp_storage_bytes = cubSortPairsWorkspaceSize(arrayLen, num_segments); - cub::DeviceSegmentedRadixSort::SortPairsDescending( - cubWorkspace, temp_storage_bytes, (const T_SCORE *)(temp_scores), - (T_SCORE *)(conf_scores_gpu), 
(const int *)(temp_idx), (int *)(index_array_gpu), arrayLen, - num_segments, (const int *)d_offsets, (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, - stream); - CSC(cudaGetLastError(), STATUS_FAILURE); - return STATUS_SUCCESS; + size_t temp_storage_bytes = cubSortPairsWorkspaceSize(arrayLen, num_segments); + cub::DeviceSegmentedRadixSort::SortPairsDescending( + cubWorkspace, + temp_storage_bytes, + (const T_SCORE*)(temp_scores), + (T_SCORE*)(conf_scores_gpu), + (const int*)(temp_idx), + (int*)(index_array_gpu), + arrayLen, + num_segments, + (const int*)d_offsets, + (const int*)d_offsets + 1, + 0, + sizeof(T_SCORE) * 8, + stream); + CSC(cudaGetLastError(), STATUS_FAILURE); + return STATUS_SUCCESS; } // sortScoresPerClass LAUNCH CONFIG -typedef pluginStatus_t (*sspcFunc)(cudaStream_t, const int, const int, const int, const int, - const float, void *, void *, void *); -struct sspcLaunchConfig { - DataType t_score; - sspcFunc function; +typedef pluginStatus_t (*sspcFunc)(cudaStream_t, + const int, + const int, + const int, + const int, + const float, + void*, + void*, + void*); +struct sspcLaunchConfig +{ + DataType t_score; + sspcFunc function; - sspcLaunchConfig(DataType t_score) : t_score(t_score) {} - sspcLaunchConfig(DataType t_score, sspcFunc function) : t_score(t_score), function(function) {} - bool operator==(const sspcLaunchConfig &other) { return t_score == other.t_score; } + sspcLaunchConfig(DataType t_score) + : t_score(t_score) + { + } + sspcLaunchConfig(DataType t_score, sspcFunc function) + : t_score(t_score) + , function(function) + { + } + bool operator==(const sspcLaunchConfig& other) + { + return t_score == other.t_score; + } }; static std::vector sspcFuncVec; -bool sspcInit() { - sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu)); - return true; +bool sspcInit() +{ + sspcFuncVec.push_back(sspcLaunchConfig(DataType::kFLOAT, sortScoresPerClass_gpu)); + return true; } -static bool initialized = sspcInit(); +static bool initialized = sspcInit(); -pluginStatus_t sortScoresPerClass(cudaStream_t stream, const int num, const int num_classes, - const int num_preds_per_class, const int background_label_id, - const float confidence_threshold, const DataType DT_SCORE, - void *conf_scores_gpu, void *index_array_gpu, void *workspace) { - sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE); - for (unsigned i = 0; i < sspcFuncVec.size(); ++i) { - if (lc == sspcFuncVec[i]) { - DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i); - return sspcFuncVec[i].function(stream, num, num_classes, num_preds_per_class, - background_label_id, confidence_threshold, conf_scores_gpu, - index_array_gpu, workspace); +pluginStatus_t sortScoresPerClass(cudaStream_t stream, + const int num, + const int num_classes, + const int num_preds_per_class, + const int background_label_id, + const float confidence_threshold, + const DataType DT_SCORE, + void* conf_scores_gpu, + void* index_array_gpu, + void* workspace) +{ + sspcLaunchConfig lc = sspcLaunchConfig(DT_SCORE); + for (unsigned i = 0; i < sspcFuncVec.size(); ++i) + { + if (lc == sspcFuncVec[i]) + { + DEBUG_PRINTF("sortScoresPerClass kernel %d\n", i); + return sspcFuncVec[i].function(stream, + num, + num_classes, + num_preds_per_class, + background_label_id, + confidence_threshold, + conf_scores_gpu, + index_array_gpu, + workspace); + } } - } - return STATUS_BAD_PARAM; + return STATUS_BAD_PARAM; } -size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes, - const int num_preds_per_class, const DataType DT_CONF) 
-size_t sortScoresPerClassWorkspaceSize(const int num, const int num_classes,
-                                       const int num_preds_per_class, const DataType DT_CONF) {
-  size_t wss[4];
-  const int arrayLen = num * num_classes * num_preds_per_class;
-  wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
-  wss[1] = arrayLen * sizeof(int);                        // temp indices
-  wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
-  if (DT_CONF == DataType::kFLOAT) {
-    wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
-  } else {
-    printf("SCORE type not supported\n");
-    return (size_t)-1;
-  }
+size_t sortScoresPerClassWorkspaceSize(const int num,
+                                       const int num_classes,
+                                       const int num_preds_per_class,
+                                       const DataType DT_CONF)
+{
+    size_t wss[4];
+    const int arrayLen = num * num_classes * num_preds_per_class;
+    wss[0] = arrayLen * mmdeploy::getElementSize(DT_CONF);  // temp scores
+    wss[1] = arrayLen * sizeof(int);                        // temp indices
+    wss[2] = (num * num_classes + 1) * sizeof(int);         // offsets
+    if (DT_CONF == DataType::kFLOAT)
+    {
+        wss[3] = cubSortPairsWorkspaceSize<float, int>(arrayLen, num * num_classes);  // cub workspace
+    }
+    else
+    {
+        printf("SCORE type not supported\n");
+        return (size_t)-1;
+    }

-  return calculateTotalWorkspaceSize(wss, 4);
+    return calculateTotalWorkspaceSize(wss, 4);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
index a6ad70262d..ab60b5f88a 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/nms/sortScoresPerImage.cu
@@ -7,75 +7,125 @@
 #include "nms/cub_helper.h"
 #include "nms/kernel.h"

-template <typename T_SCORE>
-pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream, const int num_images,
-                                      const int num_items_per_image, void *unsorted_scores,
-                                      void *unsorted_bbox_indices, void *sorted_scores,
-                                      void *sorted_bbox_indices, void *workspace) {
-  void *d_offsets = workspace;
-  void *cubWorkspace = nextWorkspacePtr((int8_t *)d_offsets, (num_images + 1) * sizeof(int));
+template<typename T_SCORE>
+pluginStatus_t sortScoresPerImage_gpu(cudaStream_t stream,
+                                      const int num_images,
+                                      const int num_items_per_image,
+                                      void* unsorted_scores,
+                                      void* unsorted_bbox_indices,
+                                      void* sorted_scores,
+                                      void* sorted_bbox_indices,
+                                      void* workspace)
+{
+    void* d_offsets = workspace;
+    void* cubWorkspace = nextWorkspacePtr((int8_t*)d_offsets, (num_images + 1) * sizeof(int));

-  setUniformOffsets(stream, num_images, num_items_per_image, (int *)d_offsets);
+    setUniformOffsets(stream, num_images, num_items_per_image, (int*)d_offsets);

-  const int arrayLen = num_images * num_items_per_image;
-  size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
-  cub::DeviceSegmentedRadixSort::SortPairsDescending(
-      cubWorkspace, temp_storage_bytes, (const T_SCORE *)(unsorted_scores),
-      (T_SCORE *)(sorted_scores), (const int *)(unsorted_bbox_indices),
-      (int *)(sorted_bbox_indices), arrayLen, num_images, (const int *)d_offsets,
-      (const int *)d_offsets + 1, 0, sizeof(T_SCORE) * 8, stream);
-  CSC(cudaGetLastError(), STATUS_FAILURE);
-  return STATUS_SUCCESS;
+    const int arrayLen = num_images * num_items_per_image;
+    size_t temp_storage_bytes = cubSortPairsWorkspaceSize<T_SCORE, int>(arrayLen, num_images);
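+    // Added comment: cub::DeviceSegmentedRadixSort treats each image as one
+    // segment bounded by [d_offsets[i], d_offsets[i + 1]); bits [0, 8 * sizeof(T_SCORE))
+    // of the key take part, i.e. a full-key descending sort of (score, index) pairs.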
+    cub::DeviceSegmentedRadixSort::SortPairsDescending(
+        cubWorkspace,
+        temp_storage_bytes,
+        (const T_SCORE*)(unsorted_scores),
+        (T_SCORE*)(sorted_scores),
+        (const int*)(unsorted_bbox_indices),
+        (int*)(sorted_bbox_indices),
+        arrayLen,
+        num_images,
+        (const int*)d_offsets,
+        (const int*)d_offsets + 1,
+        0,
+        sizeof(T_SCORE) * 8,
+        stream);
+    CSC(cudaGetLastError(), STATUS_FAILURE);
+    return STATUS_SUCCESS;
 }

 // sortScoresPerImage LAUNCH CONFIG
-typedef pluginStatus_t (*sspiFunc)(cudaStream_t, const int, const int, void *, void *, void *,
-                                   void *, void *);
-struct sspiLaunchConfig {
-  DataType t_score;
-  sspiFunc function;
+typedef pluginStatus_t (*sspiFunc)(cudaStream_t,
+                                   const int,
+                                   const int,
+                                   void*,
+                                   void*,
+                                   void*,
+                                   void*,
+                                   void*);
+struct sspiLaunchConfig
+{
+    DataType t_score;
+    sspiFunc function;

-  sspiLaunchConfig(DataType t_score) : t_score(t_score) {}
-  sspiLaunchConfig(DataType t_score, sspiFunc function) : t_score(t_score), function(function) {}
-  bool operator==(const sspiLaunchConfig &other) { return t_score == other.t_score; }
+    sspiLaunchConfig(DataType t_score)
+        : t_score(t_score)
+    {
+    }
+    sspiLaunchConfig(DataType t_score, sspiFunc function)
+        : t_score(t_score)
+        , function(function)
+    {
+    }
+    bool operator==(const sspiLaunchConfig& other)
+    {
+        return t_score == other.t_score;
+    }
 };

 static std::vector<sspiLaunchConfig> sspiFuncVec;
-bool sspiInit() {
-  sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
-  return true;
+bool sspiInit()
+{
+    sspiFuncVec.push_back(sspiLaunchConfig(DataType::kFLOAT, sortScoresPerImage_gpu<float>));
+    return true;
 }
-static bool initialized = sspiInit();
+static bool initialized = sspiInit();

-pluginStatus_t sortScoresPerImage(cudaStream_t stream, const int num_images,
-                                  const int num_items_per_image, const DataType DT_SCORE,
-                                  void *unsorted_scores, void *unsorted_bbox_indices,
-                                  void *sorted_scores, void *sorted_bbox_indices, void *workspace) {
-  sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
-  for (unsigned i = 0; i < sspiFuncVec.size(); ++i) {
-    if (lc == sspiFuncVec[i]) {
-      DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
-      return sspiFuncVec[i].function(stream, num_images, num_items_per_image, unsorted_scores,
-                                     unsorted_bbox_indices, sorted_scores, sorted_bbox_indices,
-                                     workspace);
+pluginStatus_t sortScoresPerImage(cudaStream_t stream,
+                                  const int num_images,
+                                  const int num_items_per_image,
+                                  const DataType DT_SCORE,
+                                  void* unsorted_scores,
+                                  void* unsorted_bbox_indices,
+                                  void* sorted_scores,
+                                  void* sorted_bbox_indices,
+                                  void* workspace)
+{
+    sspiLaunchConfig lc = sspiLaunchConfig(DT_SCORE);
+    for (unsigned i = 0; i < sspiFuncVec.size(); ++i)
+    {
+        if (lc == sspiFuncVec[i])
+        {
+            DEBUG_PRINTF("sortScoresPerImage kernel %d\n", i);
+            return sspiFuncVec[i].function(stream,
+                                           num_images,
+                                           num_items_per_image,
+                                           unsorted_scores,
+                                           unsorted_bbox_indices,
+                                           sorted_scores,
+                                           sorted_bbox_indices,
+                                           workspace);
+        }
     }
-  }
-  return STATUS_BAD_PARAM;
+    return STATUS_BAD_PARAM;
 }

-size_t sortScoresPerImageWorkspaceSize(const int num_images, const int num_items_per_image,
-                                       const DataType DT_SCORE) {
-  const int arrayLen = num_images * num_items_per_image;
-  size_t wss[2];
-  wss[0] = (num_images + 1) * sizeof(int);  // offsets
-  if (DT_SCORE == DataType::kFLOAT) {
-    wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen,
-                                                   num_images);  // cub workspace
-  } else {
-    printf("SCORE type not supported.\n");
-    return (size_t)-1;
-  }
+size_t sortScoresPerImageWorkspaceSize(const int num_images,
+                                       const int num_items_per_image,
+                                       const DataType DT_SCORE)
+{
+    const int arrayLen = num_images * num_items_per_image;
+    size_t wss[2];
+    wss[0] = (num_images + 1) * sizeof(int);  // offsets
+    if (DT_SCORE == DataType::kFLOAT)
+    {
+        wss[1] = cubSortPairsWorkspaceSize<float, int>(arrayLen,
+                                                       num_images);  // cub workspace
+    }
+    else
+    {
+        printf("SCORE type not supported.\n");
+        return (size_t)-1;
+    }

-  return calculateTotalWorkspaceSize(wss, 2);
+    return calculateTotalWorkspaceSize(wss, 2);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
index 47e8ae8615..ad0a1bf6de 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/common_impl/trt_cuda_helper.cu
@@ -4,92 +4,145 @@
 using mmdeploy::TensorDesc;

-template <typename scalar_t>
-__global__ void copy_permute_kernel(scalar_t *__restrict__ dst, const scalar_t *__restrict__ src,
-                                    int n, TensorDesc ts_src_stride, TensorDesc ts_dst_stride,
-                                    TensorDesc ts_permute) {
-  const int src_dim = ts_src_stride.dim;
-  const auto src_stride = ts_src_stride.stride;
-  const auto dst_stride = ts_dst_stride.stride;
-  const auto permute = ts_permute.shape;
-  CUDA_1D_KERNEL_LOOP(index, n) {
-    size_t dst_index = index;
-    size_t src_index = 0;
-    for (int i = 0; i < src_dim; ++i) {
-      int dim_index = dst_index / dst_stride[i];
-      dst_index = dst_index % dst_stride[i];
-      src_index += dim_index * src_stride[permute[i]];
+template<typename scalar_t>
+__global__ void copy_permute_kernel(scalar_t* __restrict__ dst,
+                                    const scalar_t* __restrict__ src,
+                                    int n,
+                                    TensorDesc ts_src_stride,
+                                    TensorDesc ts_dst_stride,
+                                    TensorDesc ts_permute)
+{
+    const int src_dim = ts_src_stride.dim;
+    const auto src_stride = ts_src_stride.stride;
+    const auto dst_stride = ts_dst_stride.stride;
+    const auto permute = ts_permute.shape;
+    CUDA_1D_KERNEL_LOOP(index, n)
+    {
+        size_t dst_index = index;
+        size_t src_index = 0;
+        for (int i = 0; i < src_dim; ++i)
+        {
+            int dim_index = dst_index / dst_stride[i];
+            dst_index = dst_index % dst_stride[i];
+            src_index += dim_index * src_stride[permute[i]];
+        }
+        dst[index] = src[src_index];
     }
-    dst[index] = src[src_index];
-  }
 }

-template <typename scalar_t>
-void memcpyPermute(scalar_t *dst, const scalar_t *src, int *src_size, int *permute, int src_dim,
-                   cudaStream_t stream) {
-  size_t copy_size = 1;
-  TensorDesc ts_permute;
-  memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));
+template<typename scalar_t>
+void memcpyPermute(scalar_t* dst,
+                   const scalar_t* src,
+                   int* src_size,
+                   int* permute,
+                   int src_dim,
+                   cudaStream_t stream)
+{
+    size_t copy_size = 1;
+    TensorDesc ts_permute;
+    memcpy(&(ts_permute.shape[0]), permute, src_dim * sizeof(int));

-  TensorDesc ts_src_stride;
-  TensorDesc ts_dst_stride;
-  ts_src_stride.dim = src_dim;
-  ts_dst_stride.dim = src_dim;
-  int *src_stride = &(ts_src_stride.stride[0]);
-  int *dst_stride = &(ts_dst_stride.stride[0]);
-  int *dst_size = &(ts_dst_stride.shape[0]);
-  src_stride[src_dim - 1] = 1;
-  dst_stride[src_dim - 1] = 1;
+    TensorDesc ts_src_stride;
+    TensorDesc ts_dst_stride;
+    ts_src_stride.dim = src_dim;
+    ts_dst_stride.dim = src_dim;
+    int* src_stride = &(ts_src_stride.stride[0]);
+    int* dst_stride = &(ts_dst_stride.stride[0]);
+    int* dst_size = &(ts_dst_stride.shape[0]);
+    src_stride[src_dim - 1] = 1;
+    dst_stride[src_dim - 1] = 1;

-  for (int i = src_dim - 1; i >= 0; --i) {
-    dst_size[i] = src_size[permute[i]];
-    if (i < src_dim - 1) {
-      src_stride[i] = src_stride[i + 1] * src_size[i + 1];
+    for (int i = src_dim - 1; i >= 0; --i)
+    {
+        dst_size[i] = src_size[permute[i]];
+        if (i < src_dim - 1)
+        {
+            src_stride[i] = src_stride[i + 1] * src_size[i + 1];
+        }
     }
-  }

-  for (int i = src_dim - 1; i >= 0; --i) {
-    copy_size *= dst_size[i];
-    if (i < src_dim - 1) {
-      dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
+    for (int i = src_dim - 1; i >= 0; --i)
+    {
+        copy_size *= dst_size[i];
+        if (i < src_dim - 1)
+        {
+            dst_stride[i] = dst_stride[i + 1] * dst_size[i + 1];
+        }
     }
-  }

-  copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
-      dst, src, copy_size, ts_src_stride, ts_dst_stride, ts_permute);
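+    // Added comment: each thread unravels its linear destination index via
+    // dst_stride, maps every digit through ts_permute into src_stride, and so
+    // performs the permuted copy in a single pass over dst.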
+    copy_permute_kernel<scalar_t><<<GET_BLOCKS(copy_size), THREADS_PER_BLOCK, 0, stream>>>(
+        dst,
+        src,
+        copy_size,
+        ts_src_stride,
+        ts_dst_stride,
+        ts_permute);
 }

-template void memcpyPermute<float>(float *dst, const float *src, int *src_size, int *permute,
-                                   int src_dim, cudaStream_t stream);
-template void memcpyPermute<half>(half *dst, const half *src, int *src_size, int *permute,
-                                  int src_dim, cudaStream_t stream);
+template void memcpyPermute<float>(float* dst,
+                                   const float* src,
+                                   int* src_size,
+                                   int* permute,
+                                   int src_dim,
+                                   cudaStream_t stream);

-cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t *cudnn_dtype) {
-  switch (trt_dtype) {
-    case nvinfer1::DataType::kFLOAT:
-      *cudnn_dtype = CUDNN_DATA_FLOAT;
-      break;
-    case nvinfer1::DataType::kHALF:
-      *cudnn_dtype = CUDNN_DATA_HALF;
-      break;
-    default:
-      return CUDNN_STATUS_BAD_PARAM;
-  }
-  return CUDNN_STATUS_SUCCESS;
+template void memcpyPermute<half>(half* dst,
+                                  const half* src,
+                                  int* src_size,
+                                  int* permute,
+                                  int src_dim,
+                                  cudaStream_t stream);
+
+cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, cudnnDataType_t* cudnn_dtype)
+{
+    switch (trt_dtype)
+    {
+        case nvinfer1::DataType::kFLOAT:
+            *cudnn_dtype = CUDNN_DATA_FLOAT;
+            break;
+        case nvinfer1::DataType::kHALF:
+            *cudnn_dtype = CUDNN_DATA_HALF;
+            break;
+        default:
+            return CUDNN_STATUS_BAD_PARAM;
+    }
+    return CUDNN_STATUS_SUCCESS;
 }

-template <>
-cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle, cublasOperation_t transa,
-                                     cublasOperation_t transb, int m, int n, int k,
-                                     const float *alpha, const float *A, int lda, const float *B,
-                                     int ldb, const float *beta, float *C, int ldc) {
-  return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+template<>
+cublasStatus_t cublasGemmWrap<float>(cublasHandle_t handle,
+                                     cublasOperation_t transa,
+                                     cublasOperation_t transb,
+                                     int m,
+                                     int n,
+                                     int k,
+                                     const float* alpha,
+                                     const float* A,
+                                     int lda,
+                                     const float* B,
+                                     int ldb,
+                                     const float* beta,
+                                     float* C,
+                                     int ldc)
+{
+    return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }

-template <>
-cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle, cublasOperation_t transa,
-                                    cublasOperation_t transb, int m, int n, int k,
-                                    const half *alpha, const half *A, int lda, const half *B,
-                                    int ldb, const half *beta, half *C, int ldc) {
-  return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+template<>
+cublasStatus_t cublasGemmWrap<half>(cublasHandle_t handle,
+                                    cublasOperation_t transa,
+                                    cublasOperation_t transb,
+                                    int m,
+                                    int n,
+                                    int k,
+                                    const half* alpha,
+                                    const half* A,
+                                    int lda,
+                                    const half* B,
+                                    int ldb,
+                                    const half* beta,
+                                    half* C,
+                                    int ldc)
+{
+    return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
index 0d518323d2..247093db2f 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.cpp
@@ -10,254 +10,346 @@
 using namespace nvinfer1;

-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVDeformConv2d"};
-}  // namespace
-
-DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string &name,
-                                                         const 
nvinfer1::Dims stride, - const nvinfer1::Dims padding, - const nvinfer1::Dims dilation, - const int deformableGroup, const int group) - : TRTPluginBase(name), - mStride(stride), - mPadding(padding), - mDilation(dilation), - mDeformableGroup(deformableGroup), - mGroup(group) {} - -DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mStride); - deserialize_value(&data, &length, &mPadding); - deserialize_value(&data, &length, &mDilation); - deserialize_value(&data, &length, &mDeformableGroup); - deserialize_value(&data, &length, &mGroup); -} -DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt *DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT { - DeformableConvPluginDynamic *plugin = new DeformableConvPluginDynamic( - mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // input[0] == input - // input[1] == offset - // input[2] == weight - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[2].d[0]; - - ret.d[2] = inputs[1].d[2]; - ret.d[3] = inputs[1].d[3]; - - return ret; -} - -bool DeformableConvPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); - - int batch_size = inputs[0].dims.d[0]; - int nInputPlane = inputs[0].dims.d[1]; - int inputHeight = inputs[0].dims.d[2]; - int inputWidth = inputs[0].dims.d[3]; - - int nOutputPlane = outputs[0].dims.d[1]; - int outputHeight = outputs[0].dims.d[2]; - int outputWidth = outputs[0].dims.d[3]; - - int kW = inputs[2].dims.d[2]; - int kH = inputs[2].dims.d[3]; - int im2col_step = std::min(32, batch_size); - - size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight * - outputWidth * sizeof_dtype); - - size_t out_size = 0; - if (im2col_step != 1) - out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth * - sizeof_dtype); - - return col_size + out_size; -} - -int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int batch = inputDesc[0].dims.d[0]; - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - int 
channels_out = outputDesc[0].dims.d[1]; - int kernel_h = inputDesc[2].dims.d[2]; - int kernel_w = inputDesc[2].dims.d[3]; - - const void *x = inputs[0]; - const void *offset = inputs[1]; - const void *weight = inputs[2]; - void *output = outputs[0]; - int im2col_step = std::min(batch, 32); - - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - deform_conv((float *)x, (float *)weight, (float *)offset, (float *)output, workSpace, - batch, channels, height, width, channels_out, kernel_w, kernel_h, - mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], - mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, - stream); - break; - case nvinfer1::DataType::kHALF: - deform_conv((half *)x, (half *)weight, (half *)offset, (half *)output, workSpace, batch, - channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], - mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], - mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - default: - return 1; - } - - return 0; -} - -nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + - serialized_size(mDeformableGroup) + serialized_size(mGroup); -} - -void DeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - serialize_value(&buffer, mPadding); - serialize_value(&buffer, mDilation); - serialize_value(&buffer, mDeformableGroup); - serialize_value(&buffer, mGroup); -} - -void DeformableConvPluginDynamic::attachToContext( - cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - m_cublas_handle = cublasContext; -} - -void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - nvinfer1::Dims stride{2, {1, 1}}; - nvinfer1::Dims padding{2, {0, 0}}; - nvinfer1::Dims dilation{2, {1, 1}}; - int deformableGroup = 1; - int 
group = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVDeformConv2d"}; + } // namespace + + DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group) + : TRTPluginBase(name) + , mStride(stride) + , mPadding(padding) + , mDilation(dilation) + , mDeformableGroup(deformableGroup) + , mGroup(group) + { } - std::string field_name(fc->fields[i].name); - if (field_name.compare("deform_groups") == 0) { - deformableGroup = static_cast(fc->fields[i].data)[0]; + DeformableConvPluginDynamic::DeformableConvPluginDynamic(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + deserialize_value(&data, &length, &mPadding); + deserialize_value(&data, &length, &mDilation); + deserialize_value(&data, &length, &mDeformableGroup); + deserialize_value(&data, &length, &mGroup); + } + DeformableConvPluginDynamic::~DeformableConvPluginDynamic() {} + + nvinfer1::IPluginV2DynamicExt* DeformableConvPluginDynamic::clone() const TRT_NOEXCEPT + { + DeformableConvPluginDynamic* plugin = new DeformableConvPluginDynamic( + mLayerName, + mStride, + mPadding, + mDilation, + mDeformableGroup, + mGroup); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs DeformableConvPluginDynamic::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // input[0] == input + // input[1] == offset + // input[2] == weight + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[2].d[0]; + + ret.d[2] = inputs[1].d[2]; + ret.d[3] = inputs[1].d[3]; + + return ret; + } + + bool DeformableConvPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void DeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t DeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); + + int batch_size = inputs[0].dims.d[0]; + int nInputPlane = inputs[0].dims.d[1]; + int inputHeight = inputs[0].dims.d[2]; + int inputWidth = inputs[0].dims.d[3]; + + int nOutputPlane = outputs[0].dims.d[1]; + int outputHeight = outputs[0].dims.d[2]; + int outputWidth = outputs[0].dims.d[3]; + + int kW = inputs[2].dims.d[2]; + int kH = inputs[2].dims.d[3]; + int im2col_step = std::min(32, batch_size); + + size_t col_size = mmdeploy::getAlignedSize(nInputPlane * kW * kH * im2col_step * outputHeight * + outputWidth * sizeof_dtype); + + size_t out_size = 0; + if (im2col_step != 
1) + out_size = mmdeploy::getAlignedSize(batch_size * nOutputPlane * outputHeight * outputWidth * + sizeof_dtype); + + return col_size + out_size; + } + + int DeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int batch = inputDesc[0].dims.d[0]; + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + int channels_out = outputDesc[0].dims.d[1]; + int kernel_h = inputDesc[2].dims.d[2]; + int kernel_w = inputDesc[2].dims.d[3]; + + const void* x = inputs[0]; + const void* offset = inputs[1]; + const void* weight = inputs[2]; + void* output = outputs[0]; + int im2col_step = std::min(batch, 32); + + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + deform_conv((float*)x, + (float*)weight, + (float*)offset, + (float*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + case nvinfer1::DataType::kHALF: + deform_conv((half*)x, + (half*)weight, + (half*)offset, + (half*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + default: + return 1; + } + + return 0; + } + + nvinfer1::DataType DeformableConvPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* DeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* DeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int DeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t DeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + + serialized_size(mDeformableGroup) + serialized_size(mGroup); + } + + void DeformableConvPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + serialize_value(&buffer, mPadding); + serialize_value(&buffer, mDilation); + serialize_value(&buffer, mDeformableGroup); + serialize_value(&buffer, mGroup); + } + + void DeformableConvPluginDynamic::attachToContext( + cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + m_cublas_handle = cublasContext; + } + + void DeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + DeformableConvPluginDynamicCreator::DeformableConvPluginDynamicCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); + 
mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); } - if (field_name.compare("groups") == 0) { - group = static_cast(fc->fields[i].data)[0]; + const char* DeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; } - if (field_name.compare("stride") == 0) { - stride.nbDims = 2; - stride.d[0] = static_cast(fc->fields[i].data)[0]; - stride.d[1] = static_cast(fc->fields[i].data)[1]; + const char* DeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; } - if (field_name.compare("padding") == 0) { - padding.nbDims = 2; - padding.d[0] = static_cast(fc->fields[i].data)[0]; - padding.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* DeformableConvPluginDynamicCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + nvinfer1::Dims stride{2, {1, 1}}; + nvinfer1::Dims padding{2, {0, 0}}; + nvinfer1::Dims dilation{2, {1, 1}}; + int deformableGroup = 1; + int group = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("deform_groups") == 0) + { + deformableGroup = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("groups") == 0) + { + group = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("stride") == 0) + { + stride.nbDims = 2; + stride.d[0] = static_cast(fc->fields[i].data)[0]; + stride.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("padding") == 0) + { + padding.nbDims = 2; + padding.d[0] = static_cast(fc->fields[i].data)[0]; + padding.d[1] = static_cast(fc->fields[i].data)[1]; + } + + if (field_name.compare("dilation") == 0) + { + dilation.nbDims = 2; + dilation.d[0] = static_cast(fc->fields[i].data)[0]; + dilation.d[1] = static_cast(fc->fields[i].data)[1]; + } + } + + DeformableConvPluginDynamic* plugin = + new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - if (field_name.compare("dilation") == 0) { - dilation.nbDims = 2; - dilation.d[0] = static_cast(fc->fields[i].data)[0]; - dilation.d[1] = static_cast(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* DeformableConvPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - } - - DeformableConvPluginDynamic *plugin = - new DeformableConvPluginDynamic(name, stride, padding, dilation, deformableGroup, group); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *DeformableConvPluginDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new DeformableConvPluginDynamic(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(DeformableConvPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp 
index 3ea0ccbefe..09845327ca 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv.hpp @@ -9,73 +9,99 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class DeformableConvPluginDynamic : public TRTPluginBase { - public: - DeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride, - const nvinfer1::Dims padding, const nvinfer1::Dims dilation, - const int deformableGroup, const int group); - - DeformableConvPluginDynamic(const std::string name, const void *data, size_t length); - - DeformableConvPluginDynamic() = delete; - - ~DeformableConvPluginDynamic() TRT_NOEXCEPT override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - nvinfer1::Dims mStride; - nvinfer1::Dims mPadding; - nvinfer1::Dims mDilation; - int mDeformableGroup; - int mGroup; - - cublasHandle_t m_cublas_handle; -}; - -class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase { - public: - DeformableConvPluginDynamicCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class DeformableConvPluginDynamic : public TRTPluginBase + { + public: + DeformableConvPluginDynamic(const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group); + + DeformableConvPluginDynamic(const std::string name, + const void* data, + size_t length); + + DeformableConvPluginDynamic() = delete; + + ~DeformableConvPluginDynamic() TRT_NOEXCEPT override; + + // 
IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + nvinfer1::Dims mStride; + nvinfer1::Dims mPadding; + nvinfer1::Dims mDilation; + int mDeformableGroup; + int mGroup; + + cublasHandle_t m_cublas_handle; + }; + + class DeformableConvPluginDynamicCreator : public TRTPluginCreatorBase + { + public: + DeformableConvPluginDynamicCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_DEFORM_CONV_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu index 3f401fc9e2..e62bdb0a48 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cu @@ -68,105 +68,228 @@ #include "trt_deform_conv_kernel.hpp" #include "trt_plugin_helper.hpp" -template -void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column, - const int channels, const int height, const int width, const int ksize_h, - const int ksize_w, const int pad_h, const int pad_w, const int stride_h, - const int stride_w, const int dilation_h, const int dilation_w, - const int parallel_imgs, const int deformable_group, cudaStream_t stream) { - int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; - int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; - int num_kernels = channels * height_col * width_col * parallel_imgs; - int channel_per_deformable_group = 
channels / deformable_group;
-
-  deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
-      num_kernels, input, offset, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
-      dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, channels,
-      deformable_group, height_col, width_col, column);
-
-  cudaCheckError();
+template<typename scalar_t>
+void deform_conv_im2col(const scalar_t* input,
+                        const scalar_t* offset,
+                        scalar_t* column,
+                        const int channels,
+                        const int height,
+                        const int width,
+                        const int ksize_h,
+                        const int ksize_w,
+                        const int pad_h,
+                        const int pad_w,
+                        const int stride_h,
+                        const int stride_w,
+                        const int dilation_h,
+                        const int dilation_w,
+                        const int parallel_imgs,
+                        const int deformable_group,
+                        cudaStream_t stream)
+{
+    int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+    int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+    int num_kernels = channels * height_col * width_col * parallel_imgs;
+    int channel_per_deformable_group = channels / deformable_group;
+
+    deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, stream>>>(
+        num_kernels,
+        input,
+        offset,
+        height,
+        width,
+        ksize_h,
+        ksize_w,
+        pad_h,
+        pad_w,
+        stride_h,
+        stride_w,
+        dilation_h,
+        dilation_w,
+        channel_per_deformable_group,
+        parallel_imgs,
+        channels,
+        deformable_group,
+        height_col,
+        width_col,
+        column);
+
+    cudaCheckError();
 }

-template <typename scalar_t>
-void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
-                 scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight,
-                 int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
-                 int padH, int dilationW, int dilationH, int group, int deformable_group,
-                 int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream) {
-  size_t word_size = sizeof(scalar_t);
-
-  im2col_step = std::min(int(batchSize), im2col_step);
-  long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  long outputHW = outputHeight * outputWidth;
-  long kHW = kH * kW;
-  long columns_size =
-      mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size);
-
-  // column buffer for img2col
-  char* workspace_ptr = reinterpret_cast<char*>(workspace);
-  scalar_t* columns = reinterpret_cast<scalar_t*>(workspace_ptr);
-  workspace_ptr = workspace_ptr + columns_size;
-
-  scalar_t* output_buffer;
-  if (im2col_step == 1) {
-    output_buffer = output;
-  } else {
-    // output need permute when im2col_step!=1
-    output_buffer = reinterpret_cast<scalar_t*>(workspace_ptr);
-  }
-
-  long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth;
-  long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW;
-  long out_buffer_step = nOutputPlane * im2col_step * outputHW;
-  long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group;
-  long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group);
-  long out_buffer_g_step = out_buffer_step / group;
-  int m = nOutputPlane / group;
-  int n = im2col_step * outputHW;
-  int k = nInputPlane * kHW / group;
-  scalar_t alpha = 1.f;
-  scalar_t beta = 0.f;
-
-  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
-    const scalar_t* input_start = input + elt * input_elt_step;
-    const scalar_t* offset_start = offset + elt * offset_elt_step;
-
-    deform_conv_im2col<scalar_t>(input_start, offset_start, columns, nInputPlane, inputHeight,
-                                 inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
-                                 im2col_step, deformable_group, 
stream); - - for (int g = 0; g < group; ++g) { - const scalar_t* weight_start = weight + g * weight_g_step; - scalar_t* col_start = columns + g * col_g_step; - scalar_t* out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step; - - cublasGemmWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start, - n, weight_start, k, &beta, out_buffer_start, n); - cudaCheckError(); +template +void deform_conv(const scalar_t* input, + const scalar_t* weight, + const scalar_t* offset, + scalar_t* output, + void* workspace, + int batchSize, + int nInputPlane, + int inputHeight, + int inputWidth, + int nOutputPlane, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream) +{ + size_t word_size = sizeof(scalar_t); + + im2col_step = std::min(int(batchSize), im2col_step); + long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + long outputHW = outputHeight * outputWidth; + long kHW = kH * kW; + long columns_size = mmdeploy::getAlignedSize(nInputPlane * kHW * im2col_step * outputHW * word_size); + + // column buffer for img2col + char* workspace_ptr = reinterpret_cast(workspace); + scalar_t* columns = reinterpret_cast(workspace_ptr); + workspace_ptr = workspace_ptr + columns_size; + + scalar_t* output_buffer; + if (im2col_step == 1) + { + output_buffer = output; + } + else + { + // output need permute when im2col_step!=1 + output_buffer = reinterpret_cast(workspace_ptr); + } + + long input_elt_step = im2col_step * nInputPlane * inputHeight * inputWidth; + long offset_elt_step = im2col_step * deformable_group * 2 * kHW * outputHW; + long out_buffer_step = nOutputPlane * im2col_step * outputHW; + long col_g_step = nInputPlane * kHW * im2col_step * outputHW / group; + long weight_g_step = nOutputPlane * nInputPlane * kHW / (group * group); + long out_buffer_g_step = out_buffer_step / group; + int m = nOutputPlane / group; + int n = im2col_step * outputHW; + int k = nInputPlane * kHW / group; + scalar_t alpha = 1.f; + scalar_t beta = 0.f; + + for (int elt = 0; elt < batchSize / im2col_step; elt++) + { + const scalar_t* input_start = input + elt * input_elt_step; + const scalar_t* offset_start = offset + elt * offset_elt_step; + + deform_conv_im2col(input_start, + offset_start, + columns, + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + stream); + + for (int g = 0; g < group; ++g) + { + const scalar_t* weight_start = weight + g * weight_g_step; + scalar_t* col_start = columns + g * col_g_step; + scalar_t* out_buffer_start = output_buffer + elt * out_buffer_step + g * out_buffer_g_step; + + cublasGemmWrap(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + m, + k, + &alpha, + col_start, + n, + weight_start, + k, + &beta, + out_buffer_start, + n); + cudaCheckError(); + } + } + + if (im2col_step != 1) + { + int output_buffer_shape[5] = {batchSize / im2col_step, + nOutputPlane, + im2col_step, + static_cast(outputHeight), + static_cast(outputWidth)}; + int output_buffer_permute[5] = {0, 2, 1, 3, 4}; + memcpyPermute(output, + output_buffer, + &output_buffer_shape[0], + &output_buffer_permute[0], + 5, + stream); } - } - - if (im2col_step != 1) { - int output_buffer_shape[5] = {batchSize / im2col_step, nOutputPlane, 
im2col_step, - static_cast(outputHeight), static_cast(outputWidth)}; - int output_buffer_permute[5] = {0, 2, 1, 3, 4}; - memcpyPermute(output, output_buffer, &output_buffer_shape[0], - &output_buffer_permute[0], 5, stream); - } } -template void deform_conv(const float* input, const float* weight, const float* offset, - float* output, void* workspace, int batchSize, int nInputPlane, - int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, - int dW, int dH, int padW, int padH, int dilationW, int dilationH, - int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); - -template void deform_conv<__half>(const __half* input, const __half* weight, const __half* offset, - __half* output, void* workspace, int batchSize, int nInputPlane, - int inputHeight, int inputWidth, int nOutputPlane, int kW, int kH, - int dW, int dH, int padW, int padH, int dilationW, int dilationH, - int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); +template void deform_conv(const float* input, + const float* weight, + const float* offset, + float* output, + void* workspace, + int batchSize, + int nInputPlane, + int inputHeight, + int inputWidth, + int nOutputPlane, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); + +template void deform_conv<__half>(const __half* input, + const __half* weight, + const __half* offset, + __half* output, + void* workspace, + int batchSize, + int nInputPlane, + int inputHeight, + int inputWidth, + int nOutputPlane, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh index c91f17ca4a..85e675bf9c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh +++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.cuh @@ -67,108 +67,133 @@ #include "common_cuda_helper.hpp" -template +template __device__ __forceinline__ scalar_t deformable_im2col_bilinear(const scalar_t* __restrict__ input, - const int height, const int width, - float h, float w) { - if (h <= -1 || height <= h || w <= -1 || width <= w) { - return 0; - } + const int height, + const int width, + float h, + float w) +{ + if (h <= -1 || height <= h || w <= -1 || width <= w) + { + return 0; + } - const int h_low = floorf(h); - const int w_low = floorf(w); + const int h_low = floorf(h); + const int w_low = floorf(w); - input += h_low * width; - const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast(0.0f); - const int w_high = w_low + 1; - const scalar_t v2 = - (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); - const scalar_t lw = w - w_low; - const scalar_t v_low = fmaf(v2 - v1, lw, v1); - input += width; - const scalar_t v3 = - (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast(0.0f); - const scalar_t v4 = - (h_low <= height - 2 && w_high <= width - 1) ? 
input[w_high] : static_cast(0.0f); - const scalar_t v_high = fmaf(v4 - v3, lw, v3); - const scalar_t lh = h - h_low; - const scalar_t val = fmaf(v_high - v_low, lh, v_low); - return val; + input += h_low * width; + const scalar_t v1 = (h_low >= 0 && w_low >= 0) ? input[w_low] : static_cast(0.0f); + const int w_high = w_low + 1; + const scalar_t v2 = + (h_low >= 0 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); + const scalar_t lw = w - w_low; + const scalar_t v_low = fmaf(v2 - v1, lw, v1); + input += width; + const scalar_t v3 = + (h_low <= height - 2 && w_low >= 0) ? input[w_low] : static_cast(0.0f); + const scalar_t v4 = + (h_low <= height - 2 && w_high <= width - 1) ? input[w_high] : static_cast(0.0f); + const scalar_t v_high = fmaf(v4 - v3, lw, v3); + const scalar_t lh = h - h_low; + const scalar_t val = fmaf(v_high - v_low, lh, v_low); + return val; } -template <> +template<> __device__ __forceinline__ __half deformable_im2col_bilinear(const __half* __restrict__ input, - const int height, const int width, - float h, float w) { - if (h <= -1 || height <= h || w <= -1 || width <= w) { - return 0; - } + const int height, + const int width, + float h, + float w) +{ + if (h <= -1 || height <= h || w <= -1 || width <= w) + { + return 0; + } - const int h_low = floorf(h); - const int w_low = floorf(w); + const int h_low = floorf(h); + const int w_low = floorf(w); - input += h_low * width; - const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; - const int w_high = w_low + 1; - const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; - const float lw = w - w_low; - const float v_low = fmaf(v2 - v1, lw, v1); - input += width; - const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; - const float v4 = - (h_low <= height - 2 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; - const float v_high = fmaf(v4 - v3, lw, v3); - const float lh = h - h_low; - const float val = fmaf(v_high - v_low, lh, v_low); - return __float2half(val); + input += h_low * width; + const float v1 = (h_low >= 0 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; + const int w_high = w_low + 1; + const float v2 = (h_low >= 0 && w_high <= width - 1) ? __half2float(input[w_high]) : 0.0f; + const float lw = w - w_low; + const float v_low = fmaf(v2 - v1, lw, v1); + input += width; + const float v3 = (h_low <= height - 2 && w_low >= 0) ? __half2float(input[w_low]) : 0.0f; + const float v4 = + (h_low <= height - 2 && w_high <= width - 1) ? 
__half2float(input[w_high]) : 0.0f; + const float v_high = fmaf(v4 - v3, lw, v3); + const float lh = h - h_low; + const float val = fmaf(v_high - v_low, lh, v_low); + return __float2half(val); } -template -__global__ void deformable_im2col_gpu_kernel( - const int n, const scalar_t* __restrict__ data_im, const scalar_t* __restrict__ data_offset, - const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, const int dilation_h, - const int dilation_w, const int channel_per_deformable_group, const int batch_size, - const int num_channels, const int deformable_group, const int height_col, const int width_col, - scalar_t* __restrict__ data_col) { - const int hw_col = height_col * width_col; - const int data_col_step = batch_size * hw_col; +template +__global__ void deformable_im2col_gpu_kernel(const int n, + const scalar_t* __restrict__ data_im, + const scalar_t* __restrict__ data_offset, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* __restrict__ data_col) +{ + const int hw_col = height_col * width_col; + const int data_col_step = batch_size * hw_col; - CUDA_1D_KERNEL_LOOP(index, n) { - // index index of output matrix - int tmp_index = index; - const int w_col = tmp_index % width_col; - tmp_index /= width_col; - const int h_col = tmp_index % height_col; - tmp_index /= height_col; - const int b_col = tmp_index % batch_size; - const int c_im = tmp_index / batch_size; - const int c_col = c_im * kernel_h * kernel_w; + CUDA_1D_KERNEL_LOOP(index, n) + { + // index index of output matrix + int tmp_index = index; + const int w_col = tmp_index % width_col; + tmp_index /= width_col; + const int h_col = tmp_index % height_col; + tmp_index /= height_col; + const int b_col = tmp_index % batch_size; + const int c_im = tmp_index / batch_size; + const int c_col = c_im * kernel_h * kernel_w; - // compute deformable group index - const int deformable_group_index = c_im / channel_per_deformable_group; + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; - const int h_in = h_col * stride_h - pad_h; - const int w_in = w_col * stride_w - pad_w; - scalar_t* __restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step; - const scalar_t* __restrict__ data_im_ptr = - data_im + (b_col * num_channels + c_im) * height * width; - const scalar_t* __restrict__ data_offset_ptr = - data_offset + - ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col + - h_col * width_col + w_col; - for (int i = 0; i < kernel_h; ++i) { - for (int j = 0; j < kernel_w; ++j) { - const int data_offset_h = (i * kernel_w + j) * hw_col << 1; - const scalar_t offset_h = data_offset_ptr[data_offset_h]; - const int data_offset_w = data_offset_h + hw_col; - const scalar_t offset_w = data_offset_ptr[data_offset_w]; - const scalar_t h_im = h_in + i * dilation_h + (float)offset_h; - const scalar_t w_im = w_in + j * dilation_w + (float)offset_w; - const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im); - *data_col_ptr = val; - data_col_ptr += data_col_step; - } + const int h_in = 
h_col * stride_h - pad_h;
+        const int w_in = w_col * stride_w - pad_w;
+        scalar_t* __restrict__ data_col_ptr = data_col + c_col * data_col_step + index % data_col_step;
+        const scalar_t* __restrict__ data_im_ptr =
+            data_im + (b_col * num_channels + c_im) * height * width;
+        const scalar_t* __restrict__ data_offset_ptr =
+            data_offset +
+            ((b_col * deformable_group + deformable_group_index) << 1) * kernel_h * kernel_w * hw_col +
+            h_col * width_col + w_col;
+        for (int i = 0; i < kernel_h; ++i)
+        {
+            for (int j = 0; j < kernel_w; ++j)
+            {
+                const int data_offset_h = (i * kernel_w + j) * hw_col << 1;
+                const scalar_t offset_h = data_offset_ptr[data_offset_h];
+                const int data_offset_w = data_offset_h + hw_col;
+                const scalar_t offset_w = data_offset_ptr[data_offset_w];
+                const scalar_t h_im = h_in + i * dilation_h + (float)offset_h;
+                const scalar_t w_im = w_in + j * dilation_w + (float)offset_w;
+                const scalar_t val = deformable_im2col_bilinear(data_im_ptr, height, width, h_im, w_im);
+                *data_col_ptr = val;
+                data_col_ptr += data_col_step;
+            }
+        }
     }
-  }
 }
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp
index 3d8f6dfc45..012dc894f8 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/deform_conv/trt_deform_conv_kernel.hpp
@@ -4,17 +4,47 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>

-template <typename scalar_t>
-void deform_conv_im2col(const scalar_t* input, const scalar_t* offset, scalar_t* column,
-                        const int channels, const int height, const int width, const int ksize_h,
-                        const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
-                        const int stride_w, const int dilation_h, const int dilation_w,
-                        const int parallel_imgs, const int deformable_group, cudaStream_t stream);
+template<typename scalar_t>
+void deform_conv_im2col(const scalar_t* input,
+                        const scalar_t* offset,
+                        scalar_t* column,
+                        const int channels,
+                        const int height,
+                        const int width,
+                        const int ksize_h,
+                        const int ksize_w,
+                        const int pad_h,
+                        const int pad_w,
+                        const int stride_h,
+                        const int stride_w,
+                        const int dilation_h,
+                        const int dilation_w,
+                        const int parallel_imgs,
+                        const int deformable_group,
+                        cudaStream_t stream);

-template <typename scalar_t>
-void deform_conv(const scalar_t* input, const scalar_t* weight, const scalar_t* offset,
-                 scalar_t* output, void* workspace, int batchSize, int nInputPlane, int inputHeight,
-                 int inputWidth, int nOutputPlane, int kW, int kH, int dW, int dH, int padW,
-                 int padH, int dilationW, int dilationH, int group, int deformable_group,
-                 int im2col_step, cublasHandle_t cublas_handle, cudaStream_t stream);
+template<typename scalar_t>
+void deform_conv(const scalar_t* input,
+                 const scalar_t* weight,
+                 const scalar_t* offset,
+                 scalar_t* output,
+                 void* workspace,
+                 int batchSize,
+                 int nInputPlane,
+                 int inputHeight,
+                 int inputWidth,
+                 int nOutputPlane,
+                 int kW,
+                 int kH,
+                 int dW,
+                 int dH,
+                 int padW,
+                 int padH,
+                 int dilationW,
+                 int dilationH,
+                 int group,
+                 int deformable_group,
+                 int im2col_step,
+                 cublasHandle_t cublas_handle,
+                 cudaStream_t stream);

 #endif  // TRT_DEFORM_CONV_KERNEL_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp
index b5e6c0b677..2de48da10b 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.cpp
@@ -10,141 +10,203 @@
 #include "gather_topk_kernel.hpp"
 #include 
"trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"GatherTopk"}; -} // namespace - -GatherTopk::GatherTopk(const std::string &name) : TRTPluginBase(name) {} - -GatherTopk::GatherTopk(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) {} - -nvinfer1::IPluginV2DynamicExt *GatherTopk::clone() const TRT_NOEXCEPT { - GatherTopk *plugin = new GatherTopk(mLayerName); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs GatherTopk::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - assert(inputs[0].nbDims >= inputs[1].nbDims); - nvinfer1::DimsExprs ret; - ret.nbDims = inputs[0].nbDims; - for (int i = 0; i < inputs[1].nbDims; ++i) { - ret.d[i] = inputs[1].d[i]; - } - for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) { - ret.d[i] = inputs[0].d[i]; - } - return ret; -} - -bool GatherTopk::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - switch (pos) { - case 0: - // data - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || - (ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - case 1: - // indices - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - case 2: - // output - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - default: - return true; - } - return true; -} - -void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - const int *dims = &(inputDesc[0].dims.d[0]); - const int *indices_dims = &(inputDesc[1].dims.d[0]); - int nbDims = inputDesc[0].dims.nbDims; - int indice_nbDims = inputDesc[1].dims.nbDims; - - const void *data = inputs[0]; - const void *indices = inputs[1]; - void *output = outputs[0]; - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - gather_topk_impl((float *)data, (int *)indices, dims, nbDims, indices_dims, - indice_nbDims, (float *)output, stream); - break; - - case nvinfer1::DataType::kINT32: - gather_topk_impl((int *)data, (int *)indices, dims, nbDims, indices_dims, indice_nbDims, - (int *)output, stream); - break; - default: - break; - } - - return 0; -} - -nvinfer1::DataType GatherTopk::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *GatherTopk::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GatherTopk::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int GatherTopk::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT { 
return 0; } - -void GatherTopk::serialize(void *buffer) const TRT_NOEXCEPT {} - -GatherTopkCreator::GatherTopkCreator() { - mPluginAttributes.clear(); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *GatherTopkCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *GatherTopkCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - auto *plugin = new GatherTopk(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *GatherTopkCreator::deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new GatherTopk(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(GatherTopkCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"GatherTopk"}; + } // namespace + + GatherTopk::GatherTopk(const std::string& name) + : TRTPluginBase(name) + { + } + + GatherTopk::GatherTopk(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + } + + nvinfer1::IPluginV2DynamicExt* GatherTopk::clone() const TRT_NOEXCEPT + { + GatherTopk* plugin = new GatherTopk(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs GatherTopk::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + assert(inputs[0].nbDims >= inputs[1].nbDims); + nvinfer1::DimsExprs ret; + ret.nbDims = inputs[0].nbDims; + for (int i = 0; i < inputs[1].nbDims; ++i) + { + ret.d[i] = inputs[1].d[i]; + } + for (int i = inputs[1].nbDims; i < inputs[0].nbDims; ++i) + { + ret.d[i] = inputs[0].d[i]; + } + return ret; + } + + bool GatherTopk::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + switch (pos) + { + case 0: + // data + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || + (ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + case 1: + // indices + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + case 2: + // output + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + return true; + } + + void GatherTopk::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t GatherTopk::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int GatherTopk::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const int* dims = &(inputDesc[0].dims.d[0]); + const int* indices_dims = &(inputDesc[1].dims.d[0]); + int nbDims = inputDesc[0].dims.nbDims; + int indice_nbDims = 
inputDesc[1].dims.nbDims; + + const void* data = inputs[0]; + const void* indices = inputs[1]; + void* output = outputs[0]; + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + gather_topk_impl((float*)data, + (int*)indices, + dims, + nbDims, + indices_dims, + indice_nbDims, + (float*)output, + stream); + break; + + case nvinfer1::DataType::kINT32: + gather_topk_impl((int*)data, + (int*)indices, + dims, + nbDims, + indices_dims, + indice_nbDims, + (int*)output, + stream); + break; + default: + break; + } + + return 0; + } + + nvinfer1::DataType GatherTopk::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* GatherTopk::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GatherTopk::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int GatherTopk::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t GatherTopk::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void GatherTopk::serialize(void* buffer) const TRT_NOEXCEPT {} + + GatherTopkCreator::GatherTopkCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* GatherTopkCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GatherTopkCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* GatherTopkCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + auto* plugin = new GatherTopk(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* GatherTopkCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new GatherTopk(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(GatherTopkCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp index 72f76a2678..d1a0df29e3 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk.hpp @@ -9,56 +9,75 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class GatherTopk : public TRTPluginBase { - public: - GatherTopk(const std::string &name); - - GatherTopk(const std::string name, const void *data, size_t length); - - GatherTopk() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc 
*inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; -}; - -class GatherTopkCreator : public TRTPluginCreatorBase { - public: - GatherTopkCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class GatherTopk : public TRTPluginBase + { + public: + GatherTopk(const std::string& name); + + GatherTopk(const std::string name, const void* data, size_t length); + + GatherTopk() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + }; + + class GatherTopkCreator : public TRTPluginCreatorBase + { + public: + GatherTopkCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_SCATTERND_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu index 9a5c8ec963..3c1663d499 
100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.cu
@@ -8,39 +8,67 @@
 #include "gather_topk_kernel.hpp"
 #include "trt_plugin_helper.hpp"

-template <typename scalar_t>
-__global__ void gather_topk_kernel(const scalar_t* input, const int* indices, scalar_t* output,
-                                   int batch, int num_input, int num_indices, int channel) {
-  CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel) {
-    const int b_id = index / (num_indices * channel);
-    const int n_id = (index / channel) % num_indices;
-    const int c_id = index % channel;
+template<typename scalar_t>
+__global__ void gather_topk_kernel(const scalar_t* input,
+                                   const int* indices,
+                                   scalar_t* output,
+                                   int batch,
+                                   int num_input,
+                                   int num_indices,
+                                   int channel)
+{
+    CUDA_1D_KERNEL_LOOP(index, batch * num_indices * channel)
+    {
+        const int b_id = index / (num_indices * channel);
+        const int n_id = (index / channel) % num_indices;
+        const int c_id = index % channel;

-    const int input_n_id = indices[b_id * num_indices + n_id];
-    const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
-    output[b_id * num_indices * channel + n_id * channel + c_id] = value;
-  }
+        const int input_n_id = indices[b_id * num_indices + n_id];
+        const scalar_t value = input[b_id * num_input * channel + input_n_id * channel + c_id];
+        output[b_id * num_indices * channel + n_id * channel + c_id] = value;
+    }
 }

-template <typename scalar_t>
-void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
-                      const int* indices_dims, int indice_nbDims, scalar_t* output,
-                      cudaStream_t stream) {
-  int batch = 1;
-  for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
-  int num_input = dims[indice_nbDims - 1];
-  int num_indices = indices_dims[indice_nbDims - 1];
-  int channel = 1;
-  for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
-  const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
-  gather_topk_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input, indices, output, batch,
-                                                                            num_input, num_indices, channel);
+template<typename scalar_t>
+void gather_topk_impl(const scalar_t* input,
+                      const int* indices,
+                      const int* dims,
+                      int nbDims,
+                      const int* indices_dims,
+                      int indice_nbDims,
+                      scalar_t* output,
+                      cudaStream_t stream)
+{
+    int batch = 1;
+    for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];
+    int num_input = dims[indice_nbDims - 1];
+    int num_indices = indices_dims[indice_nbDims - 1];
+    int channel = 1;
+    for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];
+    const int col_block = DIVUP(batch * num_indices * channel, THREADS_PER_BLOCK);
+    gather_topk_kernel<scalar_t><<<col_block, THREADS_PER_BLOCK, 0, stream>>>(input,
+                                                                              indices,
+                                                                              output,
+                                                                              batch,
+                                                                              num_input,
+                                                                              num_indices,
+                                                                              channel);
 }

-template void gather_topk_impl<float>(const float* input, const int* indices, const int* dims,
-                                      int nbDims, const int* indices_dims, int indice_nbDims,
-                                      float* output, cudaStream_t stream);
+template void gather_topk_impl<float>(const float* input,
+                                      const int* indices,
+                                      const int* dims,
+                                      int nbDims,
+                                      const int* indices_dims,
+                                      int indice_nbDims,
+                                      float* output,
+                                      cudaStream_t stream);

-template void gather_topk_impl<int32_t>(const int32_t* input, const int* indices, const int* dims,
-                                        int nbDims, const int* indices_dims, int indice_nbDims,
-                                        int32_t* output, cudaStream_t stream);
+template void gather_topk_impl<int32_t>(const int32_t* input,
+                                        const int* indices,
+                                        const int* dims,
+                                        int nbDims,
+                                        const int* indices_dims,
+                                        int indice_nbDims,
+                                        int32_t* output,
+                                        cudaStream_t stream);
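To see how gather_topk_impl flattens arbitrary ranks into a batch/num_input/channel layout, consider data of shape [2, 1000, 4] gathered with indices of shape [2, 100] (nbDims = 3, indice_nbDims = 2). The shapes here are hypothetical; the sketch mirrors the loop logic above and is not part of the patch:

    #include <cstdio>

    int main()
    {
        const int dims[]         = {2, 1000, 4};  // hypothetical data shape
        const int indices_dims[] = {2, 100};      // hypothetical indices shape
        const int nbDims = 3, indice_nbDims = 2;

        int batch = 1;
        for (int i = 0; i < indice_nbDims - 1; ++i) batch *= dims[i];      // -> 2
        const int num_input   = dims[indice_nbDims - 1];                   // -> 1000
        const int num_indices = indices_dims[indice_nbDims - 1];           // -> 100
        int channel = 1;
        for (int i = indice_nbDims; i < nbDims; ++i) channel *= dims[i];   // -> 4
        std::printf("batch=%d num_input=%d num_indices=%d channel=%d\n",
                    batch, num_input, num_indices, channel);
        return 0;
    }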
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
index 1f9b428394..0c5c7e6011 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/gather_topk/gather_topk_kernel.hpp
@@ -3,8 +3,13 @@
 #define TRT_GRID_SAMPLER_KERNEL_HPP
 #include <cuda_runtime.h>

-template <typename scalar_t>
-void gather_topk_impl(const scalar_t* input, const int* indices, const int* dims, int nbDims,
-                      const int* indices_dims, int indice_nbDims, scalar_t* output,
-                      cudaStream_t stream);
+template<typename scalar_t>
+void gather_topk_impl(const scalar_t* input,
+                      const int* indices,
+                      const int* dims,
+                      int nbDims,
+                      const int* indices_dims,
+                      int indice_nbDims,
+                      scalar_t* output,
+                      cudaStream_t stream);
 #endif  // TRT_GRID_SAMPLER_KERNEL_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
index 1850fbfc1a..761b61538b 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.cpp
@@ -10,145 +10,202 @@
 using namespace nvinfer1;

-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"GridPriorsTRT"};
-}  // namespace
-
-GridPriorsTRT::GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride)
-    : TRTPluginBase(name), mStride(stride) {}
-
-GridPriorsTRT::GridPriorsTRT(const std::string name, const void *data, size_t length)
-    : TRTPluginBase(name) {
-  deserialize_value(&data, &length, &mStride);
-}
-GridPriorsTRT::~GridPriorsTRT() {}
-
-nvinfer1::IPluginV2DynamicExt *GridPriorsTRT::clone() const TRT_NOEXCEPT {
-  GridPriorsTRT *plugin = new GridPriorsTRT(mLayerName, mStride);
-  plugin->setPluginNamespace(getPluginNamespace());
-
-  return plugin;
-}
-
-nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions(
-    int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs,
-    nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT {
-  // input[0] == base_anchor
-  // input[1] == empty_h
-  // input[2] == empty_w
-
-  nvinfer1::DimsExprs ret;
-  ret.nbDims = 2;
-  auto area =
-      exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]);
-  ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0]));
-  ret.d[1] = exprBuilder.constant(4);
-
-  return ret;
-}
-
-bool GridPriorsTRT::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc,
-                                              int nbInputs, int nbOutputs) TRT_NOEXCEPT {
-  if (pos == 0) {
-    return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
-            ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR);
-  } else if (pos - nbInputs == 0) {
-    return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format;
-  } else {
-    return true;
-  }
-}
-
-int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-                           const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-                           void *const *outputs, void *workSpace,
-                           cudaStream_t stream) TRT_NOEXCEPT {
-  int num_base_anchors = inputDesc[0].dims.d[0];
-  int feat_h = inputDesc[1].dims.d[0];
-  int feat_w = inputDesc[2].dims.d[0];
-
-  const void *base_anchor = inputs[0];
-  void *output = outputs[0];
-
-  auto data_type = inputDesc[0].type;
-  switch (data_type) {
-    case nvinfer1::DataType::kFLOAT:
-      trt_grid_priors_impl<float>((float *)base_anchor, (float *)output, num_base_anchors, feat_w,
-                                  feat_h, mStride.d[0],
mStride.d[1], stream); - break; - default: - return 1; - } - - return 0; -} - -nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *GridPriorsTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT { return serialized_size(mStride); } - -void GridPriorsTRT::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - ; -} - -////////////////////// creator ///////////////////////////// - -GridPriorsTRTCreator::GridPriorsTRTCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *GridPriorsTRTCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int stride_w = 1; - int stride_h = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("stride_w") == 0) { - stride_w = static_cast(fc->fields[i].data)[0]; - } - if (field_name.compare("stride_h") == 0) { - stride_h = static_cast(fc->fields[i].data)[0]; - } - } - nvinfer1::Dims stride{2, {stride_w, stride_h}}; - - GridPriorsTRT *plugin = new GridPriorsTRT(name, stride); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *GridPriorsTRTCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new GridPriorsTRT(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"GridPriorsTRT"}; + } // namespace + + GridPriorsTRT::GridPriorsTRT(const std::string& name, const nvinfer1::Dims stride) + : TRTPluginBase(name) + , mStride(stride) + { + } + + GridPriorsTRT::GridPriorsTRT(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + } + GridPriorsTRT::~GridPriorsTRT() {} + + nvinfer1::IPluginV2DynamicExt* GridPriorsTRT::clone() const TRT_NOEXCEPT + { + GridPriorsTRT* plugin = new GridPriorsTRT(mLayerName, mStride); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs GridPriorsTRT::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // input[0] == base_anchor + // input[1] == empty_h + // input[2] == empty_w + + nvinfer1::DimsExprs ret; + ret.nbDims = 2; + auto area = + exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *inputs[2].d[0], *inputs[1].d[0]); + ret.d[0] = 
exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, *area, *(inputs[0].d[0])); + ret.d[1] = exprBuilder.constant(4); + + return ret; + } + + bool GridPriorsTRT::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else if (pos - nbInputs == 0) + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + else + { + return true; + } + } + + int GridPriorsTRT::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_base_anchors = inputDesc[0].dims.d[0]; + int feat_h = inputDesc[1].dims.d[0]; + int feat_w = inputDesc[2].dims.d[0]; + + const void* base_anchor = inputs[0]; + void* output = outputs[0]; + + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + trt_grid_priors_impl((float*)base_anchor, + (float*)output, + num_base_anchors, + feat_w, + feat_h, + mStride.d[0], + mStride.d[1], + stream); + break; + default: + return 1; + } + + return 0; + } + + nvinfer1::DataType GridPriorsTRT::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* GridPriorsTRT::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GridPriorsTRT::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int GridPriorsTRT::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t GridPriorsTRT::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride); + } + + void GridPriorsTRT::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + ; + } + + ////////////////////// creator ///////////////////////////// + + GridPriorsTRTCreator::GridPriorsTRTCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_h")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride_w")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* GridPriorsTRTCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* GridPriorsTRTCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* GridPriorsTRTCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int stride_w = 1; + int stride_h = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("stride_w") == 0) + { + stride_w = static_cast(fc->fields[i].data)[0]; + } + if (field_name.compare("stride_h") == 0) + { + stride_h = static_cast(fc->fields[i].data)[0]; + } + } + nvinfer1::Dims stride{2, {stride_w, stride_h}}; + + GridPriorsTRT* plugin = new GridPriorsTRT(name, stride); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* GridPriorsTRTCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new GridPriorsTRT(name, serialData, serialLength); 
+ plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(GridPriorsTRTCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp index 0036f62586..8285ba47ab 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors.hpp @@ -9,58 +9,72 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class GridPriorsTRT : public TRTPluginBase { - public: - GridPriorsTRT(const std::string &name, const nvinfer1::Dims stride); +namespace mmdeploy +{ + class GridPriorsTRT : public TRTPluginBase + { + public: + GridPriorsTRT(const std::string& name, const nvinfer1::Dims stride); - GridPriorsTRT(const std::string name, const void *data, size_t length); + GridPriorsTRT(const std::string name, const void* data, size_t length); - GridPriorsTRT() = delete; + GridPriorsTRT() = delete; - ~GridPriorsTRT() TRT_NOEXCEPT override; + ~GridPriorsTRT() TRT_NOEXCEPT override; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; - private: - nvinfer1::Dims mStride; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - cublasHandle_t m_cublas_handle; -}; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; -class GridPriorsTRTCreator : public TRTPluginCreatorBase { - public: - GridPriorsTRTCreator(); + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - const char *getPluginName() const 
TRT_NOEXCEPT override;
+    private:
+        nvinfer1::Dims mStride;

-  const char *getPluginVersion() const TRT_NOEXCEPT override;
+        cublasHandle_t m_cublas_handle;
+    };

-  nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc)
-      TRT_NOEXCEPT override;
+    class GridPriorsTRTCreator : public TRTPluginCreatorBase
+    {
+      public:
+        GridPriorsTRTCreator();

-  nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData,
-                                         size_t serialLength) TRT_NOEXCEPT override;
-};
+        const char* getPluginName() const TRT_NOEXCEPT override;
+
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
+
+        nvinfer1::IPluginV2* createPlugin(const char* name,
+                                          const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
+
+        nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                               const void* serialData,
+                                               size_t serialLength) TRT_NOEXCEPT override;
+    };
 }  // namespace mmdeploy

 #endif  // TRT_GRID_PRIORS_HPP
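The priors kernel that follows lays anchors out anchor-fastest, then grid x, then grid y, and each prior is just its base anchor translated by the cell offset. A host-side sketch of that index decomposition (sizes are hypothetical; this mirrors the kernel's arithmetic and is not part of the patch):

    #include <cstdio>

    int main()
    {
        const int num_base_anchors = 3, feat_w = 4, feat_h = 2;  // hypothetical sizes
        const int stride_w = 16, stride_h = 16;

        for (int index = 0; index < num_base_anchors * feat_w * feat_h; ++index)
        {
            const int a = index % num_base_anchors;             // which base anchor
            const int x = (index / num_base_anchors) % feat_w;  // grid column
            const int y = index / (feat_w * num_base_anchors);  // grid row
            std::printf("prior %2d: anchor %d shifted by (%d, %d)\n",
                        index, a, x * stride_w, y * stride_h);
        }
        return 0;
    }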
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
index 72c33d179a..f6207eecc1 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.cu
@@ -5,39 +5,64 @@
 #include "trt_grid_priors_kernel.hpp"
 #include "trt_plugin_helper.hpp"

-template <typename scalar_t>
-__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor, scalar_t* output,
-                                       int num_base_anchors, int feat_w, int feat_h, int stride_w,
-                                       int stride_h) {
-  // load base anchor into shared memory.
-  extern __shared__ scalar_t shared_base_anchor[];
-  for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x) {
-    shared_base_anchor[i] = base_anchor[i];
-  }
-  __syncthreads();
+template<typename scalar_t>
+__global__ void trt_grid_priors_kernel(const scalar_t* base_anchor,
+                                       scalar_t* output,
+                                       int num_base_anchors,
+                                       int feat_w,
+                                       int feat_h,
+                                       int stride_w,
+                                       int stride_h)
+{
+    // load base anchor into shared memory.
+    extern __shared__ scalar_t shared_base_anchor[];
+    for (int i = threadIdx.x; i < num_base_anchors * 4; i += blockDim.x)
+    {
+        shared_base_anchor[i] = base_anchor[i];
+    }
+    __syncthreads();

-  CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h) {
-    const int a_offset = (index % num_base_anchors) << 2;
-    const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w);
-    const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h);
+    CUDA_1D_KERNEL_LOOP(index, num_base_anchors * feat_w * feat_h)
+    {
+        const int a_offset = (index % num_base_anchors) << 2;
+        const scalar_t w = scalar_t(((index / num_base_anchors) % feat_w) * stride_w);
+        const scalar_t h = scalar_t((index / (feat_w * num_base_anchors)) * stride_h);

-    auto out_start = output + index * 4;
-    out_start[0] = shared_base_anchor[a_offset] + w;
-    out_start[1] = shared_base_anchor[a_offset + 1] + h;
-    out_start[2] = shared_base_anchor[a_offset + 2] + w;
-    out_start[3] = shared_base_anchor[a_offset + 3] + h;
-  }
+        auto out_start = output + index * 4;
+        out_start[0] = shared_base_anchor[a_offset] + w;
+        out_start[1] = shared_base_anchor[a_offset + 1] + h;
+        out_start[2] = shared_base_anchor[a_offset + 2] + w;
+        out_start[3] = shared_base_anchor[a_offset + 3] + h;
+    }
 }

-template <typename scalar_t>
-void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
-                          int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream) {
-  trt_grid_priors_kernel<scalar_t><<<DIVUP(num_base_anchors * feat_w * feat_h, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0, stream>>>(
-      base_anchor, output, (int)num_base_anchors, (int)feat_w, (int)feat_h, (int)stride_w,
-      (int)stride_h);
+template<typename scalar_t>
+void trt_grid_priors_impl(const scalar_t* base_anchor,
+                          scalar_t* output,
+                          int num_base_anchors,
+                          int feat_w,
+                          int feat_h,
+                          int stride_w,
+                          int stride_h,
+                          cudaStream_t stream)
+{
+    trt_grid_priors_kernel<scalar_t><<<DIVUP(num_base_anchors * feat_w * feat_h, THREADS_PER_BLOCK), THREADS_PER_BLOCK, 0, stream>>>(base_anchor,
+                                                                                  output,
+                                                                                  (int)num_base_anchors,
+                                                                                  (int)feat_w,
+                                                                                  (int)feat_h,
+                                                                                  (int)stride_w,
+                                                                                  (int)stride_h);
 }

-template void trt_grid_priors_impl<float>(const float* base_anchor, float* output,
-                                          int num_base_anchors, int feat_w, int feat_h,
-                                          int stride_w, int stride_h, cudaStream_t stream);
+template void trt_grid_priors_impl<float>(const float* base_anchor,
+                                          float* output,
+                                          int num_base_anchors,
+                                          int feat_w,
+                                          int feat_h,
+                                          int stride_w,
+                                          int stride_h,
+                                          cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
index 77cef58c54..5de3690b30 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_priors/trt_grid_priors_kernel.hpp
@@ -3,8 +3,14 @@
 #define TRT_GRID_PRIORS_KERNEL_HPP
 #include <cuda_runtime.h>

-template <typename scalar_t>
-void trt_grid_priors_impl(const scalar_t* base_anchor, scalar_t* output, int num_base_anchors,
-                          int feat_w, int feat_h, int stride_w, int stride_h, cudaStream_t stream);
+template<typename scalar_t>
+void trt_grid_priors_impl(const scalar_t* base_anchor,
+                          scalar_t* output,
+                          int num_base_anchors,
+                          int feat_w,
+                          int feat_h,
+                          int stride_w,
+                          int stride_h,
+                          cudaStream_t stream);
 #endif
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
index 7e55686902..9894f7f0b4 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.cpp
@@ -9,194 +9,257 @@
 #include "trt_plugin_helper.hpp"
 #include "trt_serialize.hpp"

-namespace mmdeploy {
-namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"grid_sampler"}; -} // namespace - -TRTGridSampler::TRTGridSampler(const std::string &name, int mode, int paddingMode, - bool alignCorners) - : TRTPluginBase(name), mMode(mode), mPaddingMode(paddingMode), mAlignCorners(alignCorners) {} - -TRTGridSampler::TRTGridSampler(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mMode); - deserialize_value(&data, &length, &mPaddingMode); - deserialize_value(&data, &length, &mAlignCorners); -} - -nvinfer1::IPluginV2DynamicExt *TRTGridSampler::clone() const TRT_NOEXCEPT { - TRTGridSampler *plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = inputs[0].nbDims; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - for (int i = 2; i < ret.nbDims; ++i) { - ret.d[i] = inputs[1].d[i - 1]; - } - return ret; -} - -bool TRTGridSampler::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments -} - -size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - nvinfer1::Dims input_dims = inputDesc[0].dims; - nvinfer1::Dims grid_dims = inputDesc[1].dims; - nvinfer1::Dims output_dims = outputDesc[0].dims; - - GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear; - switch (mMode) { - case 0: - interp_mode = GridSamplerInterpolation::Bilinear; - break; - case 1: - interp_mode = GridSamplerInterpolation::Nearest; - break; - default: - break; - } - - GridSamplerPadding padding_mode = GridSamplerPadding::Zeros; - switch (mPaddingMode) { - case 0: - padding_mode = GridSamplerPadding::Zeros; - break; - - case 1: - padding_mode = GridSamplerPadding::Border; - break; - - case 2: - padding_mode = GridSamplerPadding::Reflection; - break; - default: - break; - } - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - grid_sample((float *)outputs[0], (float *)inputs[0], (float *)inputs[1], - &(output_dims.d[0]), &(input_dims.d[0]), &(grid_dims.d[0]), - input_dims.nbDims, interp_mode, padding_mode, mAlignCorners, stream); - break; - default: - return 1; - break; - } - - return 0; -} - -nvinfer1::DataType TRTGridSampler::getOutputDataType(int index, - const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 
Methods -const char *TRTGridSampler::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners); -} - -void TRTGridSampler::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mMode); - serialize_value(&buffer, mPaddingMode); - serialize_value(&buffer, mAlignCorners); -} - -////////////////////// creator ///////////////////////////// - -TRTGridSamplerCreator::TRTGridSamplerCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("interpolation_mode"), nvinfer1::PluginField("padding_mode"), - nvinfer1::PluginField("align_corners")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *TRTGridSamplerCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int mode = 0; - int paddingMode = 0; - bool alignCorners = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("interpolation_mode") == 0) { - mode = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("padding_mode") == 0) { - paddingMode = static_cast(fc->fields[i].data)[0]; - } - - if (field_name.compare("align_corners") == 0) { - alignCorners = (bool)(static_cast(fc->fields[i].data)[0]); - } - } - - TRTGridSampler *plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"grid_sampler"}; + } // namespace + + TRTGridSampler::TRTGridSampler(const std::string& name, int mode, int paddingMode, bool alignCorners) + : TRTPluginBase(name) + , mMode(mode) + , mPaddingMode(paddingMode) + , mAlignCorners(alignCorners) + { + } + + TRTGridSampler::TRTGridSampler(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mMode); + deserialize_value(&data, &length, &mPaddingMode); + deserialize_value(&data, &length, &mAlignCorners); + } + + nvinfer1::IPluginV2DynamicExt* TRTGridSampler::clone() const TRT_NOEXCEPT + { + TRTGridSampler* plugin = new TRTGridSampler(mLayerName, mMode, mPaddingMode, mAlignCorners); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTGridSampler::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs ret; + ret.nbDims = inputs[0].nbDims; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + for (int i = 2; i < ret.nbDims; ++i) + { + ret.d[i] = inputs[1].d[i - 1]; + } + return ret; + } + + bool TRTGridSampler::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == 
nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void TRTGridSampler::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + } + + size_t TRTGridSampler::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTGridSampler::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + nvinfer1::Dims input_dims = inputDesc[0].dims; + nvinfer1::Dims grid_dims = inputDesc[1].dims; + nvinfer1::Dims output_dims = outputDesc[0].dims; + + GridSamplerInterpolation interp_mode = GridSamplerInterpolation::Bilinear; + switch (mMode) + { + case 0: + interp_mode = GridSamplerInterpolation::Bilinear; + break; + case 1: + interp_mode = GridSamplerInterpolation::Nearest; + break; + default: + break; + } + + GridSamplerPadding padding_mode = GridSamplerPadding::Zeros; + switch (mPaddingMode) + { + case 0: + padding_mode = GridSamplerPadding::Zeros; + break; + + case 1: + padding_mode = GridSamplerPadding::Border; + break; + + case 2: + padding_mode = GridSamplerPadding::Reflection; + break; + default: + break; + } + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + grid_sample((float*)outputs[0], + (float*)inputs[0], + (float*)inputs[1], + &(output_dims.d[0]), + &(input_dims.d[0]), + &(grid_dims.d[0]), + input_dims.nbDims, + interp_mode, + padding_mode, + mAlignCorners, + stream); + break; + default: + return 1; + break; + } + + return 0; + } + + nvinfer1::DataType TRTGridSampler::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTGridSampler::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTGridSampler::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTGridSampler::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTGridSampler::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mMode) + serialized_size(mPaddingMode) + serialized_size(mAlignCorners); + } + + void TRTGridSampler::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mMode); + serialize_value(&buffer, mPaddingMode); + serialize_value(&buffer, mAlignCorners); + } + + ////////////////////// creator ///////////////////////////// -nvinfer1::IPluginV2 *TRTGridSamplerCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - // This object will be deleted when the network is destroyed, which will - // call FCPluginDynamic::destroy() - auto plugin = new TRTGridSampler(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} + TRTGridSamplerCreator::TRTGridSamplerCreator() + { + mPluginAttributes = std::vector({nvinfer1::PluginField("interpolation_mode"), + nvinfer1::PluginField("padding_mode"), + nvinfer1::PluginField("align_corners")}); + mFC.nbFields = mPluginAttributes.size(); + 
mFC.fields = mPluginAttributes.data(); + } + + const char* TRTGridSamplerCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTGridSamplerCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTGridSamplerCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int mode = 0; + int paddingMode = 0; + bool alignCorners = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("interpolation_mode") == 0) + { + mode = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("padding_mode") == 0) + { + paddingMode = static_cast(fc->fields[i].data)[0]; + } + + if (field_name.compare("align_corners") == 0) + { + alignCorners = (bool)(static_cast(fc->fields[i].data)[0]); + } + } + + TRTGridSampler* plugin = new TRTGridSampler(name, mode, paddingMode, alignCorners); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTGridSamplerCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + // This object will be deleted when the network is destroyed, which will + // call FCPluginDynamic::destroy() + auto plugin = new TRTGridSampler(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } -REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator); + REGISTER_TENSORRT_PLUGIN(TRTGridSamplerCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp index 0f62bce7c8..286b955d6c 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler.hpp @@ -9,76 +9,94 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { +namespace mmdeploy +{ -class TRTGridSampler : public TRTPluginBase { - public: - TRTGridSampler(const std::string &name, int mode, int paddingMode, bool alignCorners); + class TRTGridSampler : public TRTPluginBase + { + public: + TRTGridSampler(const std::string& name, + int mode, + int paddingMode, + bool alignCorners); - TRTGridSampler(const std::string name, const void *data, size_t length); + TRTGridSampler(const std::string name, + const void* data, + size_t length); - TRTGridSampler() = delete; + TRTGridSampler() = delete; - ~TRTGridSampler() TRT_NOEXCEPT override = default; + ~TRTGridSampler() TRT_NOEXCEPT override = default; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; - void 
configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - int mMode; - int mPaddingMode; - bool mAlignCorners; -}; + private: + int mMode; + int mPaddingMode; + bool mAlignCorners; + }; -class TRTGridSamplerCreator : public TRTPluginCreatorBase { - public: - TRTGridSamplerCreator(); + class TRTGridSamplerCreator : public TRTPluginCreatorBase + { + public: + TRTGridSamplerCreator(); - ~TRTGridSamplerCreator() TRT_NOEXCEPT override = default; + ~TRTGridSamplerCreator() TRT_NOEXCEPT override = default; - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_GRID_SAMPLER_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu index 5d83f98d2c..28d581dd66 100644 --- 
a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.cu
@@ -27,370 +27,470 @@ using mmdeploy::TensorDesc;
 // -1 --> -0.5
 // +1 --> (size - 1) + 0.5 == size - 0.5
 // scale_factor = size / 2
-template <typename scalar_t>
-static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size,
-                                                                    bool align_corners) {
-  if (align_corners) {
-    // unnormalize coord from [-1, 1] to [0, size - 1]
-    return ((coord + 1.f) / 2) * (size - 1);
-  } else {
-    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
-    return ((coord + 1.f) * size - 1) / 2;
-  }
+template<typename scalar_t>
+static __forceinline__ __device__ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners)
+{
+    if (align_corners)
+    {
+        // unnormalize coord from [-1, 1] to [0, size - 1]
+        return ((coord + 1.f) / 2) * (size - 1);
+    }
+    else
+    {
+        // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
+        return ((coord + 1.f) * size - 1) / 2;
+    }
 }

 // Clips coordinates to between 0 and clip_limit - 1
-template <typename scalar_t>
-static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
-  return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
+template<typename scalar_t>
+static __forceinline__ __device__ scalar_t clip_coordinates(scalar_t in, int clip_limit)
+{
+    return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
 }

 // Reflects coordinates until they fall between low and high (inclusive).
 // The bounds are passed as twice their value so that half-integer values
 // can be represented as ints.
-template <typename scalar_t>
-static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low,
-                                                               int twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<scalar_t>(0);
-  }
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = ::fabs(in - min);
-  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
-  scalar_t extra = ::fmod(in, span);
-  int flips = static_cast<int>(::floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
+template<typename scalar_t>
+static __forceinline__ __device__ scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high)
+{
+    if (twice_low == twice_high)
+    {
+        return static_cast<scalar_t>(0);
+    }
+    scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+    scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+    in = ::fabs(in - min);
+    // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+    scalar_t extra = ::fmod(in, span);
+    int flips = static_cast<int>(::floor(in / span));
+    if (flips % 2 == 0)
+    {
+        return extra + min;
+    }
+    else
+    {
+        return span - extra + min;
+    }
 }
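A quick numeric check of the two unnormalize conventions above, for size = 4: with align_corners, -1/+1 map onto the outermost pixel centers 0 and 3; without it, onto the outer edges -0.5 and 3.5. The sketch is illustrative only and not part of the patch:

    #include <cstdio>

    static float unnormalize(float coord, int size, bool align_corners)
    {
        return align_corners ? ((coord + 1.f) / 2) * (size - 1)
                             : ((coord + 1.f) * size - 1) / 2;
    }

    int main()
    {
        for (float c : {-1.f, 0.f, 1.f})
            std::printf("coord %+.1f -> aligned %.1f, unaligned %.1f\n",
                        c, unnormalize(c, 4, true), unnormalize(c, 4, false));
        return 0;
    }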
+ if (x > INT_MAX - 1 || x < INT_MIN || !::isfinite(static_cast(x))) + return static_cast(-100.0); + return x; } // Computes the pixel source index value for a grid coordinate -template +template static __forceinline__ __device__ scalar_t grid_sampler_compute_source_index( - scalar_t coord, int size, GridSamplerPadding padding_mode, bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_coordinates(coord, -1, 2 * size - 1); + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners) +{ + coord = grid_sampler_unnormalize(coord, size, align_corners); + if (padding_mode == GridSamplerPadding::Border) + { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + else if (padding_mode == GridSamplerPadding::Reflection) + { + // reflect coordinates by image borders + if (align_corners) + { + coord = reflect_coordinates(coord, 0, 2 * (size - 1)); + } + else + { + coord = reflect_coordinates(coord, -1, 2 * size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); } - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } - coord = safe_downgrade_to_int_range(coord); - return coord; + coord = safe_downgrade_to_int_range(coord); + return coord; } -static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; +static __forceinline__ __device__ bool within_bounds_2d(int h, int w, int H, int W) +{ + return h >= 0 && h < H && w >= 0 && w < W; } -static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +static __forceinline__ __device__ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) +{ + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; } -template -__global__ void grid_sampler_2d_kernel(const int nthreads, const scalar_t *input, - const scalar_t *grid, scalar_t *output, - TensorDesc input_desc, TensorDesc grid_desc, - TensorDesc output_desc, +template +__global__ void grid_sampler_2d_kernel(const int nthreads, + const scalar_t* input, + const scalar_t* grid, + scalar_t* output, + TensorDesc input_desc, + TensorDesc grid_desc, + TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, bool align_corners) { - int C = input_desc.shape[1]; - int inp_H = input_desc.shape[2]; - int inp_W = input_desc.shape[3]; - int out_H = grid_desc.shape[1]; - int out_W = grid_desc.shape[2]; - int inp_sN = input_desc.stride[0]; - int inp_sC = input_desc.stride[1]; - int inp_sH = input_desc.stride[2]; - int inp_sW = input_desc.stride[3]; - int grid_sN = grid_desc.stride[0]; - int grid_sH = grid_desc.stride[1]; - int grid_sW = grid_desc.stride[2]; - int grid_sCoor = grid_desc.stride[3]; - int out_sN = output_desc.stride[0]; - int out_sC = output_desc.stride[1]; - int out_sH = output_desc.stride[2]; - int out_sW = output_desc.stride[3]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * 
out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y coordinates from grid - scalar_t ix = grid[grid_offset]; - scalar_t iy = grid[grid_offset + grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast(::floor(ix)); - int iy_nw = static_cast(::floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + const GridSamplerPadding padding_mode, + bool align_corners) +{ + int C = input_desc.shape[1]; + int inp_H = input_desc.shape[2]; + int inp_W = input_desc.shape[3]; + int out_H = grid_desc.shape[1]; + int out_W = grid_desc.shape[2]; + int inp_sN = input_desc.stride[0]; + int inp_sC = input_desc.stride[1]; + int inp_sH = input_desc.stride[2]; + int inp_sW = input_desc.stride[3]; + int grid_sN = grid_desc.stride[0]; + int grid_sH = grid_desc.stride[1]; + int grid_sW = grid_desc.stride[2]; + int grid_sCoor = grid_desc.stride[3]; + int out_sN = output_desc.stride[0]; + int out_sC = output_desc.stride[1]; + int out_sH = output_desc.stride[2]; + int out_sW = output_desc.stride[3]; + + CUDA_1D_KERNEL_LOOP(index, nthreads) + { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int n = index / (out_H * out_W); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y coordinates from grid + scalar_t ix = grid[grid_offset]; + scalar_t iy = grid[grid_offset + grid_sCoor]; + + ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); + iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); + + if (interpolation_mode == GridSamplerInterpolation::Bilinear) + { + // get NE, NW, SE, SW pixel values from (x, y) + int ix_nw = static_cast(::floor(ix)); + int iy_nw = static_cast(::floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + // get surfaces to each neighbor: + scalar_t nw = (ix_se - ix) * (iy_se - iy); + scalar_t ne = (ix - ix_sw) * (iy_sw - iy); + scalar_t sw = (ix_ne - ix) * (iy - iy_ne); + scalar_t se = (ix - ix_nw) * (iy - iy_nw); + + // calculate bilinear weighted pixel value and set output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + 
w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) + { + *out_ptr_NCHW = static_cast(0); + if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) + { + *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + } + } } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; + else if (interpolation_mode == GridSamplerInterpolation::Nearest) + { + int ix_nearest = static_cast(::round(ix)); + int iy_nearest = static_cast(::round(iy)); + + // assign nearest neighbor pixel value to output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) + { + if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) + { + *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; + } + else + { + *out_ptr_NCHW = static_cast(0); + } + } } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast(::round(ix)); - int iy_nearest = static_cast(::round(iy)); - - // assign nearest neighbor pixel value to output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } } - } } -template -__global__ void grid_sampler_3d_kernel(const int nthreads, const scalar_t *input, - const scalar_t *grid, scalar_t *output, - TensorDesc input_desc, TensorDesc grid_desc, - TensorDesc output_desc, +template +__global__ void grid_sampler_3d_kernel(const int nthreads, + const scalar_t* input, + const scalar_t* grid, + scalar_t* output, + TensorDesc input_desc, + TensorDesc grid_desc, + TensorDesc output_desc, const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, bool align_corners) { - int C = input_desc.shape[1]; - int inp_D = input_desc.shape[2]; - int inp_H = input_desc.shape[3]; - int inp_W = input_desc.shape[4]; - int out_D = grid_desc.shape[1]; - int out_H = grid_desc.shape[2]; - int out_W = grid_desc.shape[3]; - int inp_sN = input_desc.stride[0]; - int inp_sC = input_desc.stride[1]; - int inp_sD = input_desc.stride[2]; - int inp_sH = input_desc.stride[3]; - int inp_sW = input_desc.stride[4]; - int grid_sN = grid_desc.stride[0]; - int grid_sD = grid_desc.stride[1]; - int grid_sH = grid_desc.stride[2]; - int grid_sW = grid_desc.stride[3]; - int grid_sCoor = grid_desc.stride[4]; - int out_sN = output_desc.stride[0]; - int out_sC = output_desc.stride[1]; - int out_sD = output_desc.stride[2]; - int out_sH = output_desc.stride[3]; - int out_sW = output_desc.stride[4]; - - CUDA_1D_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int 
grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z coordinates from grid - scalar_t ix = grid[grid_offset]; - scalar_t iy = grid[grid_offset + grid_sCoor]; - scalar_t iz = grid[grid_offset + 2 * grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast(::floor(ix)); - int iy_tnw = static_cast(::floor(iy)); - int iz_tnw = static_cast(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * - // tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * - // tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * - // bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * - // bse - *out_ptr_NCDHW = static_cast(0); - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + const GridSamplerPadding padding_mode, + bool align_corners) +{ + int C = input_desc.shape[1]; + int inp_D = input_desc.shape[2]; + int inp_H = input_desc.shape[3]; + int inp_W = input_desc.shape[4]; + int out_D = grid_desc.shape[1]; + int out_H = grid_desc.shape[2]; + int out_W = grid_desc.shape[3]; + int inp_sN = input_desc.stride[0]; + int inp_sC = input_desc.stride[1]; + int inp_sD = input_desc.stride[2]; + int inp_sH = input_desc.stride[3]; + int inp_sW = input_desc.stride[4]; + int grid_sN = grid_desc.stride[0]; + int grid_sD = grid_desc.stride[1]; + int 
grid_sH = grid_desc.stride[2]; + int grid_sW = grid_desc.stride[3]; + int grid_sCoor = grid_desc.stride[4]; + int out_sN = output_desc.stride[0]; + int out_sC = output_desc.stride[1]; + int out_sD = output_desc.stride[2]; + int out_sH = output_desc.stride[3]; + int out_sW = output_desc.stride[4]; + + CUDA_1D_KERNEL_LOOP(index, nthreads) + { + const int w = index % out_W; + const int h = (index / out_W) % out_H; + const int d = (index / (out_H * out_W)) % out_D; + const int n = index / (out_D * out_H * out_W); + const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + scalar_t ix = grid[grid_offset]; + scalar_t iy = grid[grid_offset + grid_sCoor]; + scalar_t iz = grid[grid_offset + 2 * grid_sCoor]; + + ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); + iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); + iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); + + if (interpolation_mode == GridSamplerInterpolation::Bilinear) + { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + int ix_tnw = static_cast(::floor(ix)); + int iy_tnw = static_cast(::floor(iy)); + int iz_tnw = static_cast(::floor(iz)); + + int ix_tne = ix_tnw + 1; + int iy_tne = iy_tnw; + int iz_tne = iz_tnw; + + int ix_tsw = ix_tnw; + int iy_tsw = iy_tnw + 1; + int iz_tsw = iz_tnw; + + int ix_tse = ix_tnw + 1; + int iy_tse = iy_tnw + 1; + int iz_tse = iz_tnw; + + int ix_bnw = ix_tnw; + int iy_bnw = iy_tnw; + int iz_bnw = iz_tnw + 1; + + int ix_bne = ix_tnw + 1; + int iy_bne = iy_tnw; + int iz_bne = iz_tnw + 1; + + int ix_bsw = ix_tnw; + int iy_bsw = iy_tnw + 1; + int iz_bsw = iz_tnw + 1; + + int ix_bse = ix_tnw + 1; + int iy_bse = iy_tnw + 1; + int iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) + { + // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * + // tne + // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * + // tse + // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * + // bne + // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * + // bse + *out_ptr_NCDHW = static_cast(0); + if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; + } + if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; + } + if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; + } + if (within_bounds_3d(iz_tse, 
iy_tse, ix_tse, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + } + if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; + } + if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; + } + if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; + } + if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; + } + } } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; + else if (interpolation_mode == GridSamplerInterpolation::Nearest) + { + int ix_nearest = static_cast(::round(ix)); + int iy_nearest = static_cast(::round(iy)); + int iz_nearest = static_cast(::round(iz)); + + // assign nearest neighbor pixel value to output pixel + auto inp_ptr_NC = input + n * inp_sN; + auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; + for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) + { + if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) + { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } + else + { + *out_ptr_NCDHW = static_cast(0); + } + } } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast(::round(ix)); - int iy_nearest = static_cast(::round(iy)); - int iz_nearest = static_cast(::round(iz)); - - // assign nearest neighbor pixel value to output pixel - auto inp_ptr_NC = input + n * inp_sN; - auto out_ptr_NCDHW = output + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW = - inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCDHW = static_cast(0); - } - } } - } } -void create_desc(const int *dims, int nb_dims, TensorDesc &desc) { - memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims); - desc.stride[nb_dims - 1] = 1; - for (int i = nb_dims - 2; i >= 0; --i) { - desc.stride[i] = desc.stride[i + 1] * desc.shape[i + 1]; - } +void create_desc(const int* dims, int nb_dims, TensorDesc& desc) +{ + memcpy(&desc.shape[0], dims, sizeof(int) * nb_dims); + desc.stride[nb_dims - 1] = 1; + for (int i = nb_dims - 2; i >= 0; --i) + { + desc.stride[i] = 
desc.stride[i + 1] * desc.shape[i + 1];
+    }
 }

-template <typename T>
-void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
-                 int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
-                 GridSamplerPadding padding, bool align_corners, cudaStream_t stream) {
-  TensorDesc input_desc;
-  create_desc(input_dims, nb_dims, input_desc);
-
-  TensorDesc output_desc;
-  create_desc(output_dims, nb_dims, output_desc);
-
-  TensorDesc grid_desc;
-  create_desc(grid_dims, nb_dims, grid_desc);
+template <typename T>
+void grid_sample(T* output,
+                 const T* input,
+                 const T* grid,
+                 int* output_dims,
+                 int* input_dims,
+                 int* grid_dims,
+                 int nb_dims,
+                 GridSamplerInterpolation interp,
+                 GridSamplerPadding padding,
+                 bool align_corners,
+                 cudaStream_t stream)
+{
+    TensorDesc input_desc;
+    create_desc(input_dims, nb_dims, input_desc);
+
+    TensorDesc output_desc;
+    create_desc(output_dims, nb_dims, output_desc);
+
+    TensorDesc grid_desc;
+    create_desc(grid_dims, nb_dims, grid_desc);
+
+    int count = 1;
+    for (int i = 0; i < nb_dims; ++i)
+    {
+        if (i == 1)
+        {
+            continue;
+        }
+        count *= output_desc.shape[i];
+    }

-  int count = 1;
-  for (int i = 0; i < nb_dims; ++i) {
-    if (i == 1) {
-      continue;
+    if (nb_dims == 4)
+    {
+        grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(count,
+                                                                                       input,
+                                                                                       grid,
+                                                                                       output,
+                                                                                       input_desc,
+                                                                                       grid_desc,
+                                                                                       output_desc,
+                                                                                       interp,
+                                                                                       padding,
+                                                                                       align_corners);
+    }
+    else if (nb_dims == 5)
+    {
+        grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(count,
+                                                                                       input,
+                                                                                       grid,
+                                                                                       output,
+                                                                                       input_desc,
+                                                                                       grid_desc,
+                                                                                       output_desc,
+                                                                                       interp,
+                                                                                       padding,
+                                                                                       align_corners);
+    }
+    else
+    {
+        printf("input and grid dims should be 4 or 5\n");
    }
-    count *= output_desc.shape[i];
-  }
-
-  if (nb_dims == 4) {
-    grid_sampler_2d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
-        count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
-        align_corners);
-  } else if (nb_dims == 5) {
-    grid_sampler_3d_kernel<T><<<GET_BLOCKS(count), THREADS_PER_BLOCK, 0, stream>>>(
-        count, input, grid, output, input_desc, grid_desc, output_desc, interp, padding,
-        align_corners);
-  } else {
-    printf("input and grid dims should be 4 or 5\n");
-  }
 }

-template void grid_sample<float>(float *output, const float *input, const float *grid,
-                                 int *output_dims, int *input_dims, int *grid_dims, int nb_dims,
-                                 GridSamplerInterpolation interp, GridSamplerPadding padding,
-                                 bool align_corners, cudaStream_t stream);
+template void grid_sample<float>(float* output,
+                                 const float* input,
+                                 const float* grid,
+                                 int* output_dims,
+                                 int* input_dims,
+                                 int* grid_dims,
+                                 int nb_dims,
+                                 GridSamplerInterpolation interp,
+                                 GridSamplerPadding padding,
+                                 bool align_corners,
+                                 cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
index e4e50332f4..2da0e3abc5 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/grid_sampler/trt_grid_sampler_kernel.hpp
@@ -3,11 +3,28 @@
 #define TRT_GRID_SAMPLER_KERNEL_HPP
 #include <cuda_runtime.h>

-enum class GridSamplerInterpolation { Bilinear, Nearest };
-enum class GridSamplerPadding { Zeros, Border, Reflection };
+enum class GridSamplerInterpolation
+{
+    Bilinear,
+    Nearest
+};
+enum class GridSamplerPadding
+{
+    Zeros,
+    Border,
+    Reflection
+};

-template <typename T>
-void grid_sample(T *output, const T *input, const T *grid, int *output_dims, int *input_dims,
-                 int *grid_dims, int nb_dims, GridSamplerInterpolation interp,
-                 GridSamplerPadding padding, bool align_corners, cudaStream_t stream);
+template <typename T>
+void grid_sample(T* output,
+                 const T* input,
+
const T* grid, + int* output_dims, + int* input_dims, + int* grid_dims, + int nb_dims, + GridSamplerInterpolation interp, + GridSamplerPadding padding, + bool align_corners, + cudaStream_t stream); #endif // TRT_GRID_SAMPLER_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp index e6aab92f4c..7b5ed533e5 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.cpp @@ -12,203 +12,259 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -constexpr const char* PLUGIN_VERSION{"1"}; -constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"}; -} // namespace - -TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, float epsilon) - : TRTPluginBase(name), mEpsilon(epsilon) {} - -TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, void const* serialData, - size_t serialLength) - : TRTPluginBase(name) { - deserialize_value(&serialData, &serialLength, &mEpsilon); -} - -TRTInstanceNormalization::~TRTInstanceNormalization() {} - -// TRTInstanceNormalization returns one output. -int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -DimsExprs TRTInstanceNormalization::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs output(inputs[0]); - return output; -} - -size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT { - int n = inputs[0].dims.d[0]; - int c = inputs[0].dims.d[1]; - int elem_size = sizeof(float); - return getAlignedSize(n * c * elem_size) * 2; -} - -int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, - const void* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) TRT_NOEXCEPT { - nvinfer1::Dims input_dims = inputDesc[0].dims; - int n = input_dims.d[0]; - int c = input_dims.d[1]; - int h = input_dims.d[2]; - int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1; - int elem_size = sizeof(float); - - void* n_scales = (void*)workspace; - void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size)); - - const void* scales = (const void*)inputs[1]; - const void* bias = (const void*)inputs[2]; - - for (int i = 0; i < n; ++i) { - cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size, - cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size, - cudaMemcpyDeviceToDevice, stream); - } - - cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1); - cudnnDataType_t cudnn_dtype{}; - convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype); - cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); - cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); - float alpha = 1; - float beta = 0; - void const* x_ptr = inputs[0]; - void* y_ptr = outputs[0]; - cudnnSetStream(_cudnn_handle, stream); - // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical - // overflows (NaNs) for fp32 data in some circumstances. 
The lower- - // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not - // acceptable. - cudnnBatchNormalizationForwardTraining(_cudnn_handle, CUDNN_BATCHNORM_SPATIAL_PERSISTENT, &alpha, - &beta, _x_desc, x_ptr, _y_desc, y_ptr, _b_desc, n_scales, - n_bias, 1., nullptr, nullptr, mEpsilon, nullptr, nullptr); - return 0; -} - -size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mEpsilon); -} - -void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mEpsilon); -} - -bool TRTInstanceNormalization::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - switch (pos) { - case 0: - case 3: - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR && - ioDesc[pos].type == ioDesc[0].type); - case 1: - case 2: - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR; - default: - return false; - } - return false; -} - -const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT { - auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon}; - plugin->setPluginNamespace(mPluginNamespace.c_str()); - return plugin; -} - -nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// Attach the plugin object to an execution context and grant the plugin the -// access to some context resource. -void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext, - cublasContext* cublasContext, - IGpuAllocator* gpuAllocator) TRT_NOEXCEPT { - _cudnn_handle = cudnnContext; - cudnnCreateTensorDescriptor(&_b_desc); - cudnnCreateTensorDescriptor(&_x_desc); - cudnnCreateTensorDescriptor(&_y_desc); -} - -// Detach the plugin object from its execution context. 
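The enqueue path above turns cuDNN batch normalization into instance normalization by folding batch into channels: the (N, C, H, W) input is described to cuDNN as (1, N*C, H, W), so spatial batch norm computes a separate mean and variance for every (n, c) pair, and the per-channel scale and bias are tiled N times to cover the N*C pseudo-channels. A minimal CPU sketch of what each pseudo-channel then receives (hypothetical standalone helper, not the plugin's API):

#include <cmath>

// Normalize one (n, c) plane of hw values in place: the same statistics
// the folded cuDNN spatial batch-norm call computes per pseudo-channel.
void instance_norm_plane(float* x, int hw, float gamma, float beta, float eps)
{
    double mean = 0.0, var = 0.0;
    for (int i = 0; i < hw; ++i) mean += x[i];
    mean /= hw;
    for (int i = 0; i < hw; ++i) var += (x[i] - mean) * (x[i] - mean);
    var /= hw;
    const double inv_std = 1.0 / std::sqrt(var + eps);
    for (int i = 0; i < hw; ++i)
        x[i] = static_cast<float>(gamma * (x[i] - mean) * inv_std + beta);
}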
-void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT { - if (_y_desc) { - cudnnDestroyTensorDescriptor(_y_desc); - _y_desc = nullptr; - } - if (_x_desc) { - cudnnDestroyTensorDescriptor(_x_desc); - _x_desc = nullptr; - } - if (_b_desc) { - cudnnDestroyTensorDescriptor(_b_desc); - _b_desc = nullptr; - } -} - -void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) TRT_NOEXCEPT {} - -// TRTInstanceNormalizationCreator methods -TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { - float epsilon = 1e-5; - const PluginField* fields = fc->fields; - for (int i = 0; i < fc->nbFields; ++i) { - const char* attrName = fields[i].name; - if (!strcmp(attrName, "epsilon")) { - epsilon = *(static_cast(fields[i].data)); - } - } - - TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon); - obj->setPluginNamespace(mNamespace.c_str()); - return obj; -} - -IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin( - const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT { - TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength}; - obj->setPluginNamespace(mNamespace.c_str()); - return obj; -} -REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator); +namespace mmdeploy +{ + namespace + { + constexpr const char* PLUGIN_VERSION{"1"}; + constexpr const char* PLUGIN_NAME{"TRTInstanceNormalization"}; + } // namespace + + TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, + float epsilon) + : TRTPluginBase(name) + , mEpsilon(epsilon) + { + } + + TRTInstanceNormalization::TRTInstanceNormalization(const std::string& name, + void const* serialData, + size_t serialLength) + : TRTPluginBase(name) + { + deserialize_value(&serialData, &serialLength, &mEpsilon); + } + + TRTInstanceNormalization::~TRTInstanceNormalization() {} + + // TRTInstanceNormalization returns one output. 
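createPlugin above scans the PluginFieldCollection by field name; each attribute arrives as an untyped pointer, so it must be cast to the type it was registered with (epsilon is a single kFLOAT32). A self-contained sketch of that lookup pattern, using a stand-in struct rather than the real nvinfer1::PluginField:

#include <cstring>

// Stand-in for nvinfer1::PluginField: a name plus an untyped payload.
struct Field
{
    const char* name;
    const void* data;
};

// Mirrors the epsilon lookup in createPlugin: scan by name, cast the
// payload, fall back to the plugin default when the field is absent.
float read_epsilon(const Field* fields, int nb_fields, float fallback = 1e-5f)
{
    for (int i = 0; i < nb_fields; ++i)
    {
        if (fields[i].data != nullptr && std::strcmp(fields[i].name, "epsilon") == 0)
        {
            return *static_cast<const float*>(fields[i].data);
        }
    }
    return fallback;
}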
+ int TRTInstanceNormalization::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + DimsExprs TRTInstanceNormalization::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs output(inputs[0]); + return output; + } + + size_t TRTInstanceNormalization::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int n = inputs[0].dims.d[0]; + int c = inputs[0].dims.d[1]; + int elem_size = sizeof(float); + return getAlignedSize(n * c * elem_size) * 2; + } + + int TRTInstanceNormalization::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT + { + nvinfer1::Dims input_dims = inputDesc[0].dims; + int n = input_dims.d[0]; + int c = input_dims.d[1]; + int h = input_dims.d[2]; + int w = input_dims.nbDims > 3 ? input_dims.d[3] : 1; + int elem_size = sizeof(float); + + void* n_scales = (void*)workspace; + void* n_bias = (void*)((char*)workspace + getAlignedSize(n * c * elem_size)); + + const void* scales = (const void*)inputs[1]; + const void* bias = (const void*)inputs[2]; + + for (int i = 0; i < n; ++i) + { + cudaMemcpyAsync((char*)n_scales + i * c * elem_size, scales, c * elem_size, cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync((char*)n_bias + i * c * elem_size, bias, c * elem_size, cudaMemcpyDeviceToDevice, stream); + } + + cudnnSetTensor4dDescriptor(_b_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, n * c, 1, 1); + cudnnDataType_t cudnn_dtype{}; + convert_trt2cudnn_dtype(inputDesc[0].type, &cudnn_dtype); + cudnnSetTensor4dDescriptor(_x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); + cudnnSetTensor4dDescriptor(_y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, n * c, h, w); + float alpha = 1; + float beta = 0; + void const* x_ptr = inputs[0]; + void* y_ptr = outputs[0]; + cudnnSetStream(_cudnn_handle, stream); + // Note: Use of CUDNN_BATCHNORM_SPATIAL_PERSISTENT can cause numerical + // overflows (NaNs) for fp32 data in some circumstances. The lower- + // performance CUDNN_BATCHNORM_SPATIAL should be used if this is not + // acceptable. 
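getWorkspaceSize above reserves two equally sized, aligned buffers of n * c floats; enqueue then carves them out of the single workspace pointer, filling the first with the scales repeated once per batch element and the second with the biases. A sketch of the layout arithmetic (the 16-byte alignment is an assumption for illustration; the actual constant comes from the plugin's getAlignedSize helper):

#include <cstddef>

constexpr size_t kAlign = 16;  // assumed; see getAlignedSize in the common code

size_t aligned_size(size_t bytes)
{
    return (bytes + kAlign - 1) / kAlign * kAlign;
}

// Workspace layout consumed by enqueue, with A = aligned_size(n * c * sizeof(float)):
//   [0, A)     tiled scales (N copies of the C per-channel scales)
//   [A, 2 * A) tiled biases
size_t workspace_bytes(int n, int c)
{
    return aligned_size(n * c * sizeof(float)) * 2;
}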
+ cudnnBatchNormalizationForwardTraining(_cudnn_handle, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT, + &alpha, + &beta, + _x_desc, + x_ptr, + _y_desc, + y_ptr, + _b_desc, + n_scales, + n_bias, + 1., + nullptr, + nullptr, + mEpsilon, + nullptr, + nullptr); + return 0; + } + + size_t TRTInstanceNormalization::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mEpsilon); + } + + void TRTInstanceNormalization::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mEpsilon); + } + + bool TRTInstanceNormalization::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + switch (pos) + { + case 0: + case 3: + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR && + ioDesc[pos].type == ioDesc[0].type); + case 1: + case 2: + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::PluginFormat::kLINEAR; + default: + return false; + } + return false; + } + + const char* TRTInstanceNormalization::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTInstanceNormalization::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTInstanceNormalization::clone() const TRT_NOEXCEPT + { + auto* plugin = new TRTInstanceNormalization{mLayerName, mEpsilon}; + plugin->setPluginNamespace(mPluginNamespace.c_str()); + return plugin; + } + + nvinfer1::DataType TRTInstanceNormalization::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // Attach the plugin object to an execution context and grant the plugin the + // access to some context resource. + void TRTInstanceNormalization::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + _cudnn_handle = cudnnContext; + cudnnCreateTensorDescriptor(&_b_desc); + cudnnCreateTensorDescriptor(&_x_desc); + cudnnCreateTensorDescriptor(&_y_desc); + } + + // Detach the plugin object from its execution context. 
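getSerializationSize, serialize, and the deserializing constructor above must agree on both the byte count and the field order; with a single field (mEpsilon) that is trivial, but the same discipline applies as more state is added. A self-contained sketch of the pattern with stand-in helpers (the plugin's serialize_value/deserialize_value behave analogously, advancing the buffer past each value):

#include <cstring>

// Stand-ins for the serialization helpers used above.
template <typename T>
void write_value(char** buffer, const T& value)
{
    std::memcpy(*buffer, &value, sizeof(T));
    *buffer += sizeof(T);  // advance so the next write lands behind this one
}

template <typename T>
void read_value(const char** data, T* value)
{
    std::memcpy(value, *data, sizeof(T));
    *data += sizeof(T);  // reads must consume fields in exactly the write order
}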
+ void TRTInstanceNormalization::detachFromContext() TRT_NOEXCEPT + { + if (_y_desc) + { + cudnnDestroyTensorDescriptor(_y_desc); + _y_desc = nullptr; + } + if (_x_desc) + { + cudnnDestroyTensorDescriptor(_x_desc); + _x_desc = nullptr; + } + if (_b_desc) + { + cudnnDestroyTensorDescriptor(_b_desc); + _b_desc = nullptr; + } + } + + void TRTInstanceNormalization::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT {} + + // TRTInstanceNormalizationCreator methods + TRTInstanceNormalizationCreator::TRTInstanceNormalizationCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(PluginField("epsilon", nullptr, PluginFieldType::kFLOAT32, 1)); + + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTInstanceNormalizationCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTInstanceNormalizationCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + IPluginV2DynamicExt* TRTInstanceNormalizationCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + float epsilon = 1e-5; + const PluginField* fields = fc->fields; + for (int i = 0; i < fc->nbFields; ++i) + { + const char* attrName = fields[i].name; + if (!strcmp(attrName, "epsilon")) + { + epsilon = *(static_cast(fields[i].data)); + } + } + + TRTInstanceNormalization* obj = new TRTInstanceNormalization(name, epsilon); + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + + IPluginV2DynamicExt* TRTInstanceNormalizationCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + TRTInstanceNormalization* obj = new TRTInstanceNormalization{name, serialData, serialLength}; + obj->setPluginNamespace(mNamespace.c_str()); + return obj; + } + REGISTER_TENSORRT_PLUGIN(TRTInstanceNormalizationCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp index 2df04a5f6d..d8119d355b 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/instance_norm/trt_instance_norm.hpp @@ -14,80 +14,97 @@ typedef unsigned short half_type; -namespace mmdeploy { -class TRTInstanceNormalization final : public TRTPluginBase { - public: - TRTInstanceNormalization(const std::string& name, float epsilon); +namespace mmdeploy +{ + class TRTInstanceNormalization final : public TRTPluginBase + { + public: + TRTInstanceNormalization(const std::string& name, + float epsilon); - TRTInstanceNormalization(const std::string& name, void const* serialData, size_t serialLength); + TRTInstanceNormalization(const std::string& name, + void const* serialData, + size_t serialLength); - TRTInstanceNormalization() = delete; + TRTInstanceNormalization() = delete; - ~TRTInstanceNormalization() TRT_NOEXCEPT override; + ~TRTInstanceNormalization() TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; - // DynamicExt plugins returns DimsExprs class instead of Dims - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, - int nbInputs, nvinfer1::IExprBuilder& exprBuilder) - TRT_NOEXCEPT override; + // DynamicExt plugins returns DimsExprs class instead 
of Dims + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void* buffer) const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - // DynamicExt plugin supportsFormat update. - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; + // DynamicExt plugin supportsFormat update. + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; - const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext* cudnn, cublasContext* cublas, - nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; + void attachToContext(cudnnContext* cudnn, + cublasContext* cublas, + nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; + void detachFromContext() TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; - private: - float mEpsilon{}; - cudnnHandle_t _cudnn_handle{}; - cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{}; - std::string mPluginNamespace{}; -}; + private: + float mEpsilon{}; + cudnnHandle_t _cudnn_handle{}; + cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _b_desc{}; + std::string mPluginNamespace{}; + }; -class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase { - public: - TRTInstanceNormalizationCreator(); + class TRTInstanceNormalizationCreator : public TRTPluginCreatorBase + { + public: + TRTInstanceNormalizationCreator(); - ~TRTInstanceNormalizationCreator() override = 
default; + ~TRTInstanceNormalizationCreator() override = default; - const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char* getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2DynamicExt* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; - nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, const void* serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_INSTANCE_NORMALIZATION_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp index 692000b740..c3540002fa 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.cpp @@ -10,297 +10,397 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVModulatedDeformConv2d"}; -} // namespace - -ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic( - const std::string &name, const nvinfer1::Dims stride, const nvinfer1::Dims padding, - const nvinfer1::Dims dilation, const int deformableGroup, const int group) - : TRTPluginBase(name), - mStride(stride), - mPadding(padding), - mDilation(dilation), - mDeformableGroup(deformableGroup), - mGroup(group) { - mWithBias = false; -} - -ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name, - const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mStride); - deserialize_value(&data, &length, &mPadding); - deserialize_value(&data, &length, &mDilation); - deserialize_value(&data, &length, &mDeformableGroup); - deserialize_value(&data, &length, &mGroup); - mWithBias = false; -} -ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt *ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT { - ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic( - mLayerName, mStride, mPadding, mDilation, mDeformableGroup, mGroup); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -static const nvinfer1::IDimensionExpr *get_hw(const nvinfer1::IDimensionExpr *input, - const nvinfer1::IDimensionExpr *weight, - const nvinfer1::IDimensionExpr *stride, - const nvinfer1::IDimensionExpr *pad, - const nvinfer1::IDimensionExpr *dilation, - nvinfer1::IExprBuilder &exprBuilder) { - using DimOp = nvinfer1::DimensionOperation; - auto expr_1 = exprBuilder.constant(1); - - // d*(w-1)+1 - auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1); - auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0); - auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1); - - // (1+2*p-k)//stride -1 - auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad); - auto out_1 = 
exprBuilder.operation(DimOp::kSUM, *input, *out_0); - auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel); - auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride); - auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1); - - return out; -} - -nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - using DimOp = nvinfer1::DimensionOperation; - auto weight_dim = inputs[3].d; - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[3].d[0]; - - auto input_h = inputs[0].d[2]; - auto input_w = inputs[0].d[3]; - auto weight_h = weight_dim[2]; - auto weight_w = weight_dim[3]; - auto dilation_w = exprBuilder.constant(mDilation.d[0]); - auto dilation_h = exprBuilder.constant(mDilation.d[1]); - auto pad_w = exprBuilder.constant(mPadding.d[0]); - auto pad_h = exprBuilder.constant(mPadding.d[1]); - auto stride_w = exprBuilder.constant(mStride.d[0]); - auto stride_h = exprBuilder.constant(mStride.d[1]); - auto expr_1 = exprBuilder.constant(1); - auto expr_2 = exprBuilder.constant(2); - - ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder); - ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder); - - return ret; -} - -bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || - ioDesc[pos].type == nvinfer1::DataType::kHALF) && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -void ModulatedDeformableConvPluginDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT { - if (nbInputs == 5) { - mWithBias = true; - } -} - -size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT { - int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); - - int batch_size = inputs[0].dims.d[0]; - int nInputPlane = inputs[0].dims.d[1]; - int inputHeight = inputs[0].dims.d[2]; - int inputWidth = inputs[0].dims.d[3]; - - int nOutputPlane = outputs[0].dims.d[1]; - int outputHeight = outputs[0].dims.d[2]; - int outputWidth = outputs[0].dims.d[3]; - - int kW = inputs[3].dims.d[2]; - int kH = inputs[3].dims.d[3]; - int im2col_step = std::min(32, batch_size); - - size_t col_size = - mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype); - - return col_size; -} - -int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int batch = inputDesc[0].dims.d[0]; - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - int channels_out = outputDesc[0].dims.d[1]; - int kernel_h = inputDesc[3].dims.d[2]; - int kernel_w = inputDesc[3].dims.d[3]; - - const void *x = inputs[0]; - const void *offset = inputs[1]; - const void 
*mask = inputs[2]; - const void *weight = inputs[3]; - const void *bias = mWithBias ? inputs[4] : nullptr; - void *output = outputs[0]; - int im2col_step = std::min(batch, 32); - - // TODO: add fp16 support - auto data_type = inputDesc[0].type; - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - ModulatedDeformConvForwardCUDAKernelLauncher( - (float *)x, (float *)weight, (float *)bias, (float *)offset, (float *)mask, - (float *)output, workSpace, batch, channels, height, width, channels_out, kernel_w, - kernel_h, mStride.d[0], mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], - mDilation.d[1], mGroup, mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - case nvinfer1::DataType::kHALF: - ModulatedDeformConvForwardCUDAKernelLauncher( - (half *)x, (half *)weight, (half *)bias, (half *)offset, (half *)mask, (half *)output, - workSpace, batch, channels, height, width, channels_out, kernel_w, kernel_h, mStride.d[0], - mStride.d[1], mPadding.d[0], mPadding.d[1], mDilation.d[0], mDilation.d[1], mGroup, - mDeformableGroup, im2col_step, m_cublas_handle, stream); - break; - default: - return 1; - break; - } - - return 0; -} - -nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + - serialized_size(mDeformableGroup) + serialized_size(mGroup); -} - -void ModulatedDeformableConvPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mStride); - serialize_value(&buffer, mPadding); - serialize_value(&buffer, mDilation); - serialize_value(&buffer, mDeformableGroup); - serialize_value(&buffer, mGroup); -} - -void ModulatedDeformableConvPluginDynamic::attachToContext( - cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - m_cublas_handle = cublasContext; -} - -void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); - mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::createPlugin( - const char *name, const 
nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - nvinfer1::Dims stride{2, {1, 1}}; - nvinfer1::Dims padding{2, {0, 0}}; - nvinfer1::Dims dilation{2, {1, 1}}; - int deformableGroup = 1; - int group = 1; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVModulatedDeformConv2d"}; + } // namespace + + ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group) + : TRTPluginBase(name) + , mStride(stride) + , mPadding(padding) + , mDilation(dilation) + , mDeformableGroup(deformableGroup) + , mGroup(group) + { + mWithBias = false; } - std::string field_name(fc->fields[i].name); - if (field_name.compare("deform_groups") == 0) { - deformableGroup = static_cast(fc->fields[i].data)[0]; + ModulatedDeformableConvPluginDynamic::ModulatedDeformableConvPluginDynamic(const std::string name, + const void* data, + size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mStride); + deserialize_value(&data, &length, &mPadding); + deserialize_value(&data, &length, &mDilation); + deserialize_value(&data, &length, &mDeformableGroup); + deserialize_value(&data, &length, &mGroup); + mWithBias = false; + } + ModulatedDeformableConvPluginDynamic::~ModulatedDeformableConvPluginDynamic() {} + + nvinfer1::IPluginV2DynamicExt* ModulatedDeformableConvPluginDynamic::clone() const TRT_NOEXCEPT + { + ModulatedDeformableConvPluginDynamic* plugin = new ModulatedDeformableConvPluginDynamic(mLayerName, + mStride, + mPadding, + mDilation, + mDeformableGroup, + mGroup); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + static const nvinfer1::IDimensionExpr* get_hw(const nvinfer1::IDimensionExpr* input, + const nvinfer1::IDimensionExpr* weight, + const nvinfer1::IDimensionExpr* stride, + const nvinfer1::IDimensionExpr* pad, + const nvinfer1::IDimensionExpr* dilation, + nvinfer1::IExprBuilder& exprBuilder) + { + using DimOp = nvinfer1::DimensionOperation; + auto expr_1 = exprBuilder.constant(1); + + // d*(w-1)+1 + auto kernel_0 = exprBuilder.operation(DimOp::kSUB, *weight, *expr_1); + auto kernel_1 = exprBuilder.operation(DimOp::kPROD, *dilation, *kernel_0); + auto kernel = exprBuilder.operation(DimOp::kSUM, *kernel_1, *expr_1); + + // (1+2*p-k)//stride -1 + auto out_0 = exprBuilder.operation(DimOp::kSUM, *pad, *pad); + auto out_1 = exprBuilder.operation(DimOp::kSUM, *input, *out_0); + auto out_2 = exprBuilder.operation(DimOp::kSUB, *out_1, *kernel); + auto out_3 = exprBuilder.operation(DimOp::kFLOOR_DIV, *out_2, *stride); + auto out = exprBuilder.operation(DimOp::kSUM, *out_3, *expr_1); + + return out; + } + + nvinfer1::DimsExprs ModulatedDeformableConvPluginDynamic::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + using DimOp = nvinfer1::DimensionOperation; + auto weight_dim = inputs[3].d; + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[3].d[0]; + + auto input_h = inputs[0].d[2]; + auto input_w = inputs[0].d[3]; + auto weight_h = weight_dim[2]; + auto weight_w = weight_dim[3]; + auto dilation_w = exprBuilder.constant(mDilation.d[0]); + auto dilation_h = exprBuilder.constant(mDilation.d[1]); 
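get_hw above assembles the standard convolution output-size formula, out = floor((in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride) + 1, out of IDimensionExpr operations; note that the shorthand comment "(1+2*p-k)//stride -1" in the code carries the wrong final sign, since the expression actually ends by adding 1. The plain-integer equivalent:

// Plain-int version of the expression get_hw builds symbolically.
int conv_out_size(int in, int kernel, int stride, int pad, int dilation)
{
    const int k_eff = dilation * (kernel - 1) + 1;  // d*(k-1)+1, the dilated kernel extent
    return (in + 2 * pad - k_eff) / stride + 1;     // floor division, then + 1
}

// e.g. conv_out_size(7, /*kernel=*/3, /*stride=*/2, /*pad=*/1, /*dilation=*/1) == 4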
+ auto pad_w = exprBuilder.constant(mPadding.d[0]); + auto pad_h = exprBuilder.constant(mPadding.d[1]); + auto stride_w = exprBuilder.constant(mStride.d[0]); + auto stride_h = exprBuilder.constant(mStride.d[1]); + auto expr_1 = exprBuilder.constant(1); + auto expr_2 = exprBuilder.constant(2); + + ret.d[2] = get_hw(input_h, weight_h, stride_h, pad_h, dilation_h, exprBuilder); + ret.d[3] = get_hw(input_w, weight_w, stride_w, pad_w, dilation_w, exprBuilder); + + return ret; + } + + bool ModulatedDeformableConvPluginDynamic::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT || + ioDesc[pos].type == nvinfer1::DataType::kHALF) && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + void ModulatedDeformableConvPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + if (nbInputs == 5) + { + mWithBias = true; + } + } + + size_t ModulatedDeformableConvPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + int sizeof_dtype = mmdeploy::getElementSize(outputs[0].type); + + int batch_size = inputs[0].dims.d[0]; + int nInputPlane = inputs[0].dims.d[1]; + int inputHeight = inputs[0].dims.d[2]; + int inputWidth = inputs[0].dims.d[3]; + + int nOutputPlane = outputs[0].dims.d[1]; + int outputHeight = outputs[0].dims.d[2]; + int outputWidth = outputs[0].dims.d[3]; + + int kW = inputs[3].dims.d[2]; + int kH = inputs[3].dims.d[3]; + int im2col_step = std::min(32, batch_size); + + size_t col_size = + mmdeploy::getAlignedSize(nInputPlane * kW * kH * outputHeight * outputWidth * sizeof_dtype); + + return col_size; + } + + int ModulatedDeformableConvPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int batch = inputDesc[0].dims.d[0]; + int channels = inputDesc[0].dims.d[1]; + int height = inputDesc[0].dims.d[2]; + int width = inputDesc[0].dims.d[3]; + int channels_out = outputDesc[0].dims.d[1]; + int kernel_h = inputDesc[3].dims.d[2]; + int kernel_w = inputDesc[3].dims.d[3]; + + const void* x = inputs[0]; + const void* offset = inputs[1]; + const void* mask = inputs[2]; + const void* weight = inputs[3]; + const void* bias = mWithBias ? 
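// getWorkspaceSize() above reserves a single im2col column buffer. A sketch of
// the sizing rule, assuming getAlignedSize() rounds a byte count up to a fixed
// alignment boundary (16 is an assumed granularity, not confirmed here):
static size_t col_buffer_bytes(size_t channels_in, size_t kernel_h, size_t kernel_w,
                               size_t out_h, size_t out_w, size_t elem_size)
{
    size_t raw = channels_in * kernel_h * kernel_w * out_h * out_w * elem_size;
    const size_t align = 16;                   // assumed alignment granularity
    return (raw + align - 1) / align * align;  // round up to the boundary
}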
inputs[4] : nullptr; + void* output = outputs[0]; + int im2col_step = std::min(batch, 32); + + // TODO: add fp16 support + auto data_type = inputDesc[0].type; + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + ModulatedDeformConvForwardCUDAKernelLauncher((float*)x, + (float*)weight, + (float*)bias, + (float*)offset, + (float*)mask, + (float*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + case nvinfer1::DataType::kHALF: + ModulatedDeformConvForwardCUDAKernelLauncher((half*)x, + (half*)weight, + (half*)bias, + (half*)offset, + (half*)mask, + (half*)output, + workSpace, + batch, + channels, + height, + width, + channels_out, + kernel_w, + kernel_h, + mStride.d[0], + mStride.d[1], + mPadding.d[0], + mPadding.d[1], + mDilation.d[0], + mDilation.d[1], + mGroup, + mDeformableGroup, + im2col_step, + m_cublas_handle, + stream); + break; + default: + return 1; + break; + } + + return 0; + } + + nvinfer1::DataType ModulatedDeformableConvPluginDynamic::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* ModulatedDeformableConvPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* ModulatedDeformableConvPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int ModulatedDeformableConvPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t ModulatedDeformableConvPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mStride) + serialized_size(mPadding) + serialized_size(mDilation) + + serialized_size(mDeformableGroup) + serialized_size(mGroup); + } + + void ModulatedDeformableConvPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mStride); + serialize_value(&buffer, mPadding); + serialize_value(&buffer, mDilation); + serialize_value(&buffer, mDeformableGroup); + serialize_value(&buffer, mGroup); + } + + void ModulatedDeformableConvPluginDynamic::attachToContext( + cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + m_cublas_handle = cublasContext; + } + + void ModulatedDeformableConvPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + ModulatedDeformableConvPluginDynamicCreator::ModulatedDeformableConvPluginDynamicCreator() + { + mPluginAttributes.clear(); + mPluginAttributes.emplace_back(nvinfer1::PluginField("stride")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("padding")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("dilation")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("groups")); + mPluginAttributes.emplace_back(nvinfer1::PluginField("deform_groups")); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); } - if (field_name.compare("groups") == 0) { - group = static_cast(fc->fields[i].data)[0]; + const char* ModulatedDeformableConvPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; } - if (field_name.compare("stride") == 0) { - stride.nbDims = 2; - stride.d[0] = static_cast(fc->fields[i].data)[0]; - stride.d[1] = 
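// The creator registered above advertises the attributes "stride", "padding",
// "dilation", "groups" and "deform_groups". A hedged sketch of how a caller
// could fill the matching PluginFieldCollection before calling createPlugin();
// the variable names and values below are illustrative only:
int32_t stride_hw[2] = {1, 1};
int32_t deform_groups = 1;
std::vector<nvinfer1::PluginField> fields{
    {"stride", stride_hw, nvinfer1::PluginFieldType::kINT32, 2},
    {"deform_groups", &deform_groups, nvinfer1::PluginFieldType::kINT32, 1}};
nvinfer1::PluginFieldCollection field_collection{static_cast<int32_t>(fields.size()),
                                                 fields.data()};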
static_cast<const int *>(fc->fields[i].data)[1]; + const char* ModulatedDeformableConvPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; } - if (field_name.compare("padding") == 0) { - padding.nbDims = 2; - padding.d[0] = static_cast<const int *>(fc->fields[i].data)[0]; - padding.d[1] = static_cast<const int *>(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* ModulatedDeformableConvPluginDynamicCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + nvinfer1::Dims stride{2, {1, 1}}; + nvinfer1::Dims padding{2, {0, 0}}; + nvinfer1::Dims dilation{2, {1, 1}}; + int deformableGroup = 1; + int group = 1; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("deform_groups") == 0) + { + deformableGroup = static_cast<const int*>(fc->fields[i].data)[0]; + } + + if (field_name.compare("groups") == 0) + { + group = static_cast<const int*>(fc->fields[i].data)[0]; + } + + if (field_name.compare("stride") == 0) + { + stride.nbDims = 2; + stride.d[0] = static_cast<const int*>(fc->fields[i].data)[0]; + stride.d[1] = static_cast<const int*>(fc->fields[i].data)[1]; + } + + if (field_name.compare("padding") == 0) + { + padding.nbDims = 2; + padding.d[0] = static_cast<const int*>(fc->fields[i].data)[0]; + padding.d[1] = static_cast<const int*>(fc->fields[i].data)[1]; + } + + if (field_name.compare("dilation") == 0) + { + dilation.nbDims = 2; + dilation.d[0] = static_cast<const int*>(fc->fields[i].data)[0]; + dilation.d[1] = static_cast<const int*>(fc->fields[i].data)[1]; + } + } + + ModulatedDeformableConvPluginDynamic* plugin = new ModulatedDeformableConvPluginDynamic( + name, + stride, + padding, + dilation, + deformableGroup, + group); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - if (field_name.compare("dilation") == 0) { - dilation.nbDims = 2; - dilation.d[0] = static_cast<const int *>(fc->fields[i].data)[0]; - dilation.d[1] = static_cast<const int *>(fc->fields[i].data)[1]; + nvinfer1::IPluginV2* ModulatedDeformableConvPluginDynamicCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; } - } - - ModulatedDeformableConvPluginDynamic *plugin = new ModulatedDeformableConvPluginDynamic( - name, stride, padding, dilation, deformableGroup, group); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *ModulatedDeformableConvPluginDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new ModulatedDeformableConvPluginDynamic(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(ModulatedDeformableConvPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp index 2dc6ed2f20..1bfbc17735 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv.hpp @@ -9,74 +9,101 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class 
ModulatedDeformableConvPluginDynamic : public TRTPluginBase { - public: - ModulatedDeformableConvPluginDynamic(const std::string &name, const nvinfer1::Dims stride, - const nvinfer1::Dims padding, const nvinfer1::Dims dilation, - const int deformableGroup, const int group); - - ModulatedDeformableConvPluginDynamic(const std::string name, const void *data, size_t length); - - ModulatedDeformableConvPluginDynamic() = delete; - - ~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - nvinfer1::Dims mStride; - nvinfer1::Dims mPadding; - nvinfer1::Dims mDilation; - int mDeformableGroup; - int mGroup; - bool mWithBias; - - cublasHandle_t m_cublas_handle; -}; - -class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase { - public: - ModulatedDeformableConvPluginDynamicCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class ModulatedDeformableConvPluginDynamic : public TRTPluginBase + { + public: + ModulatedDeformableConvPluginDynamic(const std::string& name, + const nvinfer1::Dims stride, + const nvinfer1::Dims padding, + const nvinfer1::Dims dilation, + const int deformableGroup, + const int group); + + ModulatedDeformableConvPluginDynamic(const std::string name, + const void* data, + size_t length); + + ModulatedDeformableConvPluginDynamic() = delete; + + ~ModulatedDeformableConvPluginDynamic() TRT_NOEXCEPT override; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int 
outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + nvinfer1::Dims mStride; + nvinfer1::Dims mPadding; + nvinfer1::Dims mDilation; + int mDeformableGroup; + int mGroup; + bool mWithBias; + + cublasHandle_t m_cublas_handle; + }; + + class ModulatedDeformableConvPluginDynamicCreator : public TRTPluginCreatorBase + { + public: + ModulatedDeformableConvPluginDynamicCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_MODULATED_DEFORM_CONV_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu index 1e1f99d5ff..1b8884c7dc 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.cu @@ -7,132 +7,284 @@ #include "trt_modulated_deform_conv_kernel.hpp" #include "trt_plugin_helper.hpp" -template -void trt_modulated_deformable_im2col(const T* data_im_, const T* data_offset_, const T* data_mask_, - const int batch_size, const int channels, const int height_im, - const int width_im, const int height_col, const int width_col, - const int kernel_h, const int kenerl_w, const int pad_h, - const int pad_w, const int stride_h, const int stride_w, - const int dilation_h, const int dilation_w, - const int deformable_group, T* data_col_, - cudaStream_t stream) { - // num_axes should be smaller than block size - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * batch_size * height_col * width_col; - - 
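// trt_modulated_deformable_im2col below expands one batch of the input into a
// column matrix so that deformable convolution reduces to a GEMM. A small
// sketch of the buffer-shape bookkeeping (hypothetical helper, as a reading
// aid for the launch that follows):
struct Im2colShape
{
    int rows;  // channels * kernel_h * kernel_w, one row per channel/tap pair
    int cols;  // height_col * width_col, one column per output position
};
static Im2colShape im2col_shape(int channels, int kernel_h, int kernel_w,
                                int height_col, int width_col)
{
    return {channels * kernel_h * kernel_w, height_col * width_col};
}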
modulated_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, - pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, - batch_size, channels, deformable_group, height_col, width_col, data_col_); - - cudaCheckError(); +template +void trt_modulated_deformable_im2col(const T* data_im_, + const T* data_offset_, + const T* data_mask_, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + T* data_col_, + cudaStream_t stream) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + modulated_deformable_im2col_gpu_kernel + <<>>(num_kernels, + data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kenerl_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + channels, + deformable_group, + height_col, + width_col, + data_col_); + + cudaCheckError(); } -template -__global__ void output_add_bias_kernel(scalar_t* output, const scalar_t* bias, size_t step_batch, - size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { output[index] += bias[(index % step_batch) / step_channel]; } +template +__global__ void output_add_bias_kernel(scalar_t* output, + const scalar_t* bias, + size_t step_batch, + size_t step_channel, + size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + output[index] += bias[(index % step_batch) / step_channel]; + } } #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) -template <> -__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, - size_t step_batch, size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { - const __half b = bias[(index % step_batch) / step_channel]; - const __half o = output[index]; - output[index] = __hadd(o, b); - } +template<> +__global__ void output_add_bias_kernel<__half>(__half* output, + const __half* bias, + size_t step_batch, + size_t step_channel, + size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + const __half b = bias[(index % step_batch) / step_channel]; + const __half o = output[index]; + output[index] = __hadd(o, b); + } } #else -template <> -__global__ void output_add_bias_kernel<__half>(__half* output, const __half* bias, - size_t step_batch, size_t step_channel, size_t n) { - CUDA_1D_KERNEL_LOOP(index, n) { - const __half b = bias[(index % step_batch) / step_channel]; - const __half o = output[index]; - output[index] = __float2half(__half2float(o) + __half2float(b)); - } +template<> +__global__ void output_add_bias_kernel<__half>(__half* output, + const __half* bias, + size_t step_batch, + size_t step_channel, + size_t n) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + const __half b = bias[(index % step_batch) / step_channel]; + const __half o = output[index]; + output[index] = __float2half(__half2float(o) + __half2float(b)); + } } #endif -template -static void output_add_bias(scalar_t* output, const scalar_t* bias, size_t batch, size_t channel, - size_t height, size_t width, cudaStream_t stream) { - size_t step_channel = height * width; - size_t step_batch = step_channel * 
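// output_add_bias_kernel above maps a flat NCHW offset back to its channel:
// with step_channel = H*W and step_batch = C*H*W, the bias index is
// (index % step_batch) / step_channel. A tiny sketch of that arithmetic:
static size_t channel_of(size_t index, size_t step_batch, size_t step_channel)
{
    return (index % step_batch) / step_channel;  // result lies in [0, C)
}
// e.g. with C = 2 and H = W = 2: step_channel = 4, step_batch = 8, and
// channel_of(5, 8, 4) == 1, so element 5 receives bias[1].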
channel; - size_t n = step_batch * batch; - output_add_bias_kernel<<>>(output, bias, step_batch, - step_channel, n); +template +static void output_add_bias(scalar_t* output, + const scalar_t* bias, + size_t batch, + size_t channel, + size_t height, + size_t width, + cudaStream_t stream) +{ + size_t step_channel = height * width; + size_t step_batch = step_channel * channel; + size_t n = step_batch * batch; + output_add_bias_kernel<<>>(output, + bias, + step_batch, + step_channel, + n); } -template -void ModulatedDeformConvForwardCUDAKernelLauncher( - const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset, - const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream) { - bool with_bias = (bias != nullptr); - - im2col_step = std::min(int(batch), im2col_step); - assert(batch % im2col_step == 0); - - const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - scalar_t* columns = (scalar_t*)workspace; - - const size_t input_step = channels * height * width; - const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out; - const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out; - const size_t out_step = channels_out * height_out * width_out; - const size_t out_group_step = out_step / group; - const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out; - const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w; - - const int m = channels_out / group; - const int n = height_out * width_out; - const int k = channels / group * kernel_h * kernel_w; - scalar_t alpha = 1.; - scalar_t beta = 0.; - - for (int b = 0; b < batch; b++) { - const scalar_t* input_start = input + b * input_step; - const scalar_t* offset_start = offset + b * offset_step; - const scalar_t* mask_start = mask + b * mask_step; - trt_modulated_deformable_im2col( - input_start, offset_start, mask_start, 1, channels, height, width, height_out, width_out, - kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, - deformable_group, columns, stream); - - for (int g = 0; g < group; g++) { - const scalar_t* weight_start = weight + g * weight_g_step; - scalar_t* col_start = columns + g * col_g_step; - scalar_t* out_buffer_start = output + b * out_step + g * out_group_step; - - cublasGemmWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, col_start, - n, weight_start, k, &beta, out_buffer_start, n); - cudaCheckError(); +template +void ModulatedDeformConvForwardCUDAKernelLauncher(const scalar_t* input, + const scalar_t* weight, + const scalar_t* bias, + const scalar_t* offset, + const scalar_t* mask, + scalar_t* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int channels_out, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int pad_w, + int pad_h, + int dilation_w, + int dilation_h, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream) +{ + bool with_bias = (bias != nullptr); + + im2col_step = std::min(int(batch), 
im2col_step); + assert(batch % im2col_step == 0); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + scalar_t* columns = (scalar_t*)workspace; + + const size_t input_step = channels * height * width; + const size_t offset_step = deformable_group * kernel_h * kernel_w * 2 * height_out * width_out; + const size_t mask_step = deformable_group * kernel_h * kernel_w * height_out * width_out; + const size_t out_step = channels_out * height_out * width_out; + const size_t out_group_step = out_step / group; + const size_t col_g_step = channels * kernel_w * kernel_h / group * height_out * width_out; + const size_t weight_g_step = channels_out / group * channels / group * kernel_h * kernel_w; + + const int m = channels_out / group; + const int n = height_out * width_out; + const int k = channels / group * kernel_h * kernel_w; + scalar_t alpha = 1.; + scalar_t beta = 0.; + + for (int b = 0; b < batch; b++) + { + const scalar_t* input_start = input + b * input_step; + const scalar_t* offset_start = offset + b * offset_step; + const scalar_t* mask_start = mask + b * mask_step; + trt_modulated_deformable_im2col( + input_start, + offset_start, + mask_start, + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns, + stream); + + for (int g = 0; g < group; g++) + { + const scalar_t* weight_start = weight + g * weight_g_step; + scalar_t* col_start = columns + g * col_g_step; + scalar_t* out_buffer_start = output + b * out_step + g * out_group_step; + + cublasGemmWrap(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + n, + m, + k, + &alpha, + col_start, + n, + weight_start, + k, + &beta, + out_buffer_start, + n); + cudaCheckError(); + } } - } - if (with_bias) { - output_add_bias(output, bias, batch, channels_out, height_out, width_out, stream); - } + if (with_bias) + { + output_add_bias(output, + bias, + batch, + channels_out, + height_out, + width_out, + stream); + } } -template void ModulatedDeformConvForwardCUDAKernelLauncher( - const float* input, const float* weight, const float* bias, const float* offset, - const float* mask, float* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); - -template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>( - const __half* input, const __half* weight, const __half* bias, const __half* offset, - const __half* mask, __half* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); +template void ModulatedDeformConvForwardCUDAKernelLauncher(const float* input, + const float* weight, + const float* bias, + const float* offset, + const float* mask, + float* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int channels_out, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int pad_w, + int pad_h, + int dilation_w, + int dilation_h, + int 
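// The cublasGemmWrap call above evaluates out = weight * columns per group
// with row-major buffers on a column-major BLAS by swapping the operands:
// C_rm(m x n) = A_rm(m x k) * B_rm(k x n) is computed as
// C_cm(n x m) = B_cm(n x k) * A_cm(k x m). A float-only sketch with plain
// cuBLAS (gemm_row_major is a hypothetical helper; error handling elided):
#include <cublas_v2.h>
static void gemm_row_major(cublasHandle_t handle, int m, int n, int k,
                           const float* A, const float* B, float* C)
{
    const float alpha = 1.f, beta = 0.f;
    // leading dimensions follow the column-major view of each operand
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, B, n, A, k, &beta, C, n);
}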
group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); + +template void ModulatedDeformConvForwardCUDAKernelLauncher<__half>(const __half* input, + const __half* weight, + const __half* bias, + const __half* offset, + const __half* mask, + __half* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int channels_out, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int pad_w, + int pad_h, + int dilation_w, + int dilation_h, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp index 4cdec4fb38..4d928b16c5 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/modulated_deform_conv/trt_modulated_deform_conv_kernel.hpp @@ -4,12 +4,31 @@ #include #include -template -void ModulatedDeformConvForwardCUDAKernelLauncher( - const scalar_t* input, const scalar_t* weight, const scalar_t* bias, const scalar_t* offset, - const scalar_t* mask, scalar_t* output, void* workspace, int batch, int channels, int height, - int width, int channels_out, int kernel_w, int kernel_h, int stride_w, int stride_h, int pad_w, - int pad_h, int dilation_w, int dilation_h, int group, int deformable_group, int im2col_step, - cublasHandle_t cublas_handle, cudaStream_t stream); +template +void ModulatedDeformConvForwardCUDAKernelLauncher(const scalar_t* input, + const scalar_t* weight, + const scalar_t* bias, + const scalar_t* offset, + const scalar_t* mask, + scalar_t* output, + void* workspace, + int batch, + int channels, + int height, + int width, + int channels_out, + int kernel_w, + int kernel_h, + int stride_w, + int stride_h, + int pad_w, + int pad_h, + int dilation_w, + int dilation_h, + int group, + int deformable_group, + int im2col_step, + cublasHandle_t cublas_handle, + cudaStream_t stream); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp index ad9a518da7..456acca9b4 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.cpp @@ -9,219 +9,263 @@ #include "trt_multi_level_roi_align_kernel.hpp" #include "trt_plugin_helper.hpp" #include "trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVMultiLevelRoiAlign"}; -} // namespace - -TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, - int alignedWidth, int poolMode, int sampleNum, - const std::vector &featmapStrides, - float roiScaleFactor, int finestScale, bool aligned) - : TRTPluginBase(name), - mAlignedHeight(alignedHeight), - mAlignedWidth(alignedWidth), - mPoolMode(poolMode), - mSampleNum(sampleNum), - mFeatmapStrides(featmapStrides), - mRoiScaleFactor(roiScaleFactor), - mFinestScale(finestScale), - mAligned(aligned) {} - -TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mAlignedHeight); - 
deserialize_value(&data, &length, &mAlignedWidth); - deserialize_value(&data, &length, &mPoolMode); - deserialize_value(&data, &length, &mSampleNum); - deserialize_value(&data, &length, &mRoiScaleFactor); - deserialize_value(&data, &length, &mFinestScale); - deserialize_value(&data, &length, &mAligned); - deserialize_value(&data, &length, &mFeatmapStrides); -} - -nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT { - TRTMultiLevelRoiAlign *plugin = - new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, - mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // warning, nbInputs should equal to mFeatmapStrides.size() + 1 - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[1].d[1]; - ret.d[2] = exprBuilder.constant(mAlignedHeight); - ret.d[3] = exprBuilder.constant(mAlignedWidth); - - return ret; -} - -bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments - ASSERT(nbOutputs == 1); - ASSERT(nbInputs >= 1); - mFeatmapStrides = - std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1)); -} - -size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int num_rois = inputDesc[0].dims.d[0]; - int batch_size = inputDesc[1].dims.d[0]; - int channels = inputDesc[1].dims.d[1]; - - const int kMaxFeatMap = 10; - int heights[kMaxFeatMap]; - int widths[kMaxFeatMap]; - float strides[kMaxFeatMap]; - - int num_feats = mFeatmapStrides.size(); - for (int i = 0; i < num_feats; ++i) { - heights[i] = inputDesc[i + 1].dims.d[2]; - widths[i] = inputDesc[i + 1].dims.d[3]; - strides[i] = mFeatmapStrides[i]; - } - - const void *rois = inputs[0]; - const void *const *feats = inputs + 1; - - multi_level_roi_align((float *)outputs[0], (const float *)rois, num_rois, feats, num_feats, - batch_size, channels, &heights[0], &widths[0], &strides[0], - mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, - mRoiScaleFactor, mFinestScale, mAligned, stream); - - return 0; -} - -nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index, - const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return nvinfer1::DataType::kFLOAT; -} - -// IPluginV2 Methods -const char *TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int 
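// Shape contract encoded by getOutputDimensions above: the rois input is
// [num_rois, 5] (batch index plus x1, y1, x2, y2), each feature level is
// [N, C, H_i, W_i], and the pooled output is
// [num_rois, C, output_height, output_width]; only the last two dimensions
// are build-time constants. A sketch of that bookkeeping (hypothetical helper):
static void roi_align_out_shape(int num_rois, int channels, int aligned_h,
                                int aligned_w, int out_shape[4])
{
    out_shape[0] = num_rois;   // one pooled feature per RoI
    out_shape[1] = channels;   // channels taken from the feature maps
    out_shape[2] = aligned_h;  // mAlignedHeight
    out_shape[3] = aligned_w;  // mAlignedWidth
}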
TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + - serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) + - serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) + - serialized_size(mAligned); -} - -void TRTMultiLevelRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mAlignedHeight); - serialize_value(&buffer, mAlignedWidth); - serialize_value(&buffer, mPoolMode); - serialize_value(&buffer, mSampleNum); - serialize_value(&buffer, mRoiScaleFactor); - serialize_value(&buffer, mFinestScale); - serialize_value(&buffer, mAligned); - serialize_value(&buffer, mFeatmapStrides); -} - -TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), - nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"), - nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), - nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int alignedHeight = 7; - int alignedWidth = 7; - int poolMode = 0; - int sampleNum = 2; - std::vector featmapStrides; - float roiScaleFactor = -1; - int finestScale = 56; - bool aligned = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if (field_name.compare("output_height") == 0) { - alignedHeight = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("output_width") == 0) { - alignedWidth = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("pool_mode") == 0) { - poolMode = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("sampling_ratio") == 0) { - sampleNum = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("roi_scale_factor") == 0) { - roiScaleFactor = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("finest_scale") == 0) { - finestScale = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("featmap_strides") == 0) { - int data_size = (fc->fields[i].length); - const float *data_start = static_cast(fc->fields[i].data); - featmapStrides = std::vector(data_start, data_start + data_size); - } else if (field_name.compare("aligned") == 0) { - int aligned_int = static_cast(fc->fields[i].data)[0]; - aligned = aligned_int != 0; - } - } - - ASSERT(featmapStrides.size() != 0); - - TRTMultiLevelRoiAlign *plugin = - new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum, - featmapStrides, roiScaleFactor, finestScale, aligned); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRoiAlignCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new 
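// serialize() and the deserializing constructor above must visit the fields in
// the same order, with the variable-length mFeatmapStrides written last. A
// reduced sketch of that round-trip contract, assuming the serialize_value /
// deserialize_value helpers advance a cursor symmetrically on both sides:
#include <cstring>
struct Cursor
{
    char* p;  // current position inside the serialization buffer
};
template <typename T>
static void put(Cursor& c, const T& v)
{
    std::memcpy(c.p, &v, sizeof(v));  // write, then advance
    c.p += sizeof(v);
}
template <typename T>
static void get(Cursor& c, T& v)
{
    std::memcpy(&v, c.p, sizeof(v));  // read in the same order it was written
    c.p += sizeof(v);
}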
TRTMultiLevelRoiAlign(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVMultiLevelRoiAlign"}; + } // namespace + + TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string& name, int alignedHeight, int alignedWidth, int poolMode, int sampleNum, const std::vector& featmapStrides, float roiScaleFactor, int finestScale, bool aligned) + : TRTPluginBase(name) + , mAlignedHeight(alignedHeight) + , mAlignedWidth(alignedWidth) + , mPoolMode(poolMode) + , mSampleNum(sampleNum) + , mFeatmapStrides(featmapStrides) + , mRoiScaleFactor(roiScaleFactor) + , mFinestScale(finestScale) + , mAligned(aligned) + { + } + + TRTMultiLevelRoiAlign::TRTMultiLevelRoiAlign(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mAlignedHeight); + deserialize_value(&data, &length, &mAlignedWidth); + deserialize_value(&data, &length, &mPoolMode); + deserialize_value(&data, &length, &mSampleNum); + deserialize_value(&data, &length, &mRoiScaleFactor); + deserialize_value(&data, &length, &mFinestScale); + deserialize_value(&data, &length, &mAligned); + deserialize_value(&data, &length, &mFeatmapStrides); + } + + nvinfer1::IPluginV2DynamicExt* TRTMultiLevelRoiAlign::clone() const TRT_NOEXCEPT + { + TRTMultiLevelRoiAlign* plugin = + new TRTMultiLevelRoiAlign(mLayerName, mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, mFeatmapStrides, mRoiScaleFactor, mFinestScale, mAligned); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTMultiLevelRoiAlign::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + // warning, nbInputs should equal to mFeatmapStrides.size() + 1 + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[1].d[1]; + ret.d[2] = exprBuilder.constant(mAlignedHeight); + ret.d[3] = exprBuilder.constant(mAlignedWidth); + + return ret; + } + + bool TRTMultiLevelRoiAlign::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + void TRTMultiLevelRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + ASSERT(nbOutputs == 1); + ASSERT(nbInputs >= 1); + mFeatmapStrides = + std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + (nbInputs - 1)); + } + + size_t TRTMultiLevelRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTMultiLevelRoiAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_rois = inputDesc[0].dims.d[0]; + int batch_size = inputDesc[1].dims.d[0]; + int channels = inputDesc[1].dims.d[1]; + + const int kMaxFeatMap = 10; + int heights[kMaxFeatMap]; + int 
widths[kMaxFeatMap]; + float strides[kMaxFeatMap]; + + int num_feats = mFeatmapStrides.size(); + for (int i = 0; i < num_feats; ++i) + { + heights[i] = inputDesc[i + 1].dims.d[2]; + widths[i] = inputDesc[i + 1].dims.d[3]; + strides[i] = mFeatmapStrides[i]; + } + + const void* rois = inputs[0]; + const void* const* feats = inputs + 1; + + multi_level_roi_align((float*)outputs[0], (const float*)rois, num_rois, feats, num_feats, batch_size, channels, &heights[0], &widths[0], &strides[0], mAlignedHeight, mAlignedWidth, mPoolMode, mSampleNum, mRoiScaleFactor, mFinestScale, mAligned, stream); + + return 0; + } + + nvinfer1::DataType TRTMultiLevelRoiAlign::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return nvinfer1::DataType::kFLOAT; + } + + // IPluginV2 Methods + const char* TRTMultiLevelRoiAlign::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRoiAlign::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTMultiLevelRoiAlign::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTMultiLevelRoiAlign::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + + serialized_size(mAlignedWidth) + serialized_size(mPoolMode) + serialized_size(mSampleNum) + + serialized_size(mRoiScaleFactor) + serialized_size(mFinestScale) + + serialized_size(mAligned); + } + + void TRTMultiLevelRoiAlign::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mAlignedHeight); + serialize_value(&buffer, mAlignedWidth); + serialize_value(&buffer, mPoolMode); + serialize_value(&buffer, mSampleNum); + serialize_value(&buffer, mRoiScaleFactor); + serialize_value(&buffer, mFinestScale); + serialize_value(&buffer, mAligned); + serialize_value(&buffer, mFeatmapStrides); + } + + TRTMultiLevelRoiAlignCreator::TRTMultiLevelRoiAlignCreator() + { + mPluginAttributes = std::vector<nvinfer1::PluginField>( + {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), nvinfer1::PluginField("pool_mode"), nvinfer1::PluginField("sampling_ratio"), nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTMultiLevelRoiAlignCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTMultiLevelRoiAlignCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int alignedHeight = 7; + int alignedWidth = 7; + int poolMode = 0; + int sampleNum = 2; + std::vector<float> featmapStrides; + float roiScaleFactor = -1; + int finestScale = 56; + bool aligned = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("output_height") == 0) + { + alignedHeight = static_cast<const int*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("output_width") == 0) + { + alignedWidth = static_cast<const int*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("pool_mode") == 0) + { + poolMode = static_cast<const int*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("sampling_ratio") == 0) + { + sampleNum = static_cast<const int*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("roi_scale_factor") == 0) + { + roiScaleFactor = static_cast<const float*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("finest_scale") == 0) + { + finestScale = static_cast<const int*>(fc->fields[i].data)[0]; + } + else if (field_name.compare("featmap_strides") == 0) + { + int data_size = (fc->fields[i].length); + const float* data_start = static_cast<const float*>(fc->fields[i].data); + featmapStrides = std::vector<float>(data_start, data_start + data_size); + } + else if (field_name.compare("aligned") == 0) + { + int aligned_int = static_cast<const int*>(fc->fields[i].data)[0]; + aligned = aligned_int != 0; + } + } + + ASSERT(featmapStrides.size() != 0); + + TRTMultiLevelRoiAlign* plugin = + new TRTMultiLevelRoiAlign(name, alignedHeight, alignedWidth, poolMode, sampleNum, featmapStrides, roiScaleFactor, finestScale, aligned); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTMultiLevelRoiAlignCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTMultiLevelRoiAlign(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRoiAlignCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp index a9a06236e0..814118d29b 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align.hpp @@ -10,69 +10,65 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTMultiLevelRoiAlign : public TRTPluginBase { - public: - TRTMultiLevelRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, int poolMode, - int sampleNum, const std::vector<float> &featmapStrides, - float roiScaleFactor = -1, int finestScale = 56, bool aligned = false); +namespace mmdeploy +{ + class TRTMultiLevelRoiAlign : public TRTPluginBase + { + public: + TRTMultiLevelRoiAlign(const std::string& name, int alignedHeight, int alignedWidth, int poolMode, int sampleNum, const std::vector<float>& featmapStrides, float roiScaleFactor = -1, int finestScale = 56, bool aligned = false); - TRTMultiLevelRoiAlign(const std::string name, const void *data, size_t length); + TRTMultiLevelRoiAlign(const std::string name, const void* data, size_t length); - TRTMultiLevelRoiAlign() = delete; + TRTMultiLevelRoiAlign() = delete; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void 
*const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, nvinfer1::IExprBuilder& exprBuilder) + TRT_NOEXCEPT override; + bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const TRT_NOEXCEPT override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const TRT_NOEXCEPT override; - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; - private: - int mAlignedHeight; - int mAlignedWidth; - int mPoolMode; - int mSampleNum; - std::vector mFeatmapStrides; - float mRoiScaleFactor; - int mFinestScale; - bool mAligned; -}; + private: + int mAlignedHeight; + int mAlignedWidth; + int mPoolMode; + int mSampleNum; + std::vector mFeatmapStrides; + float mRoiScaleFactor; + int mFinestScale; + bool mAligned; + }; -class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase { - public: - TRTMultiLevelRoiAlignCreator(); + class TRTMultiLevelRoiAlignCreator : public TRTPluginCreatorBase + { + public: + TRTMultiLevelRoiAlignCreator(); - const char *getPluginName() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; + nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu 
b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu index 9eefbe3f32..260086b511 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.cu @@ -10,167 +10,264 @@ #include "trt_plugin_helper.hpp" const int kMAX_FEATMAP_SIZE = 10; -struct FeatData { - const void *data[kMAX_FEATMAP_SIZE]; - int batch_size; - int channels; - int h[kMAX_FEATMAP_SIZE]; - int w[kMAX_FEATMAP_SIZE]; - float spatial_scale[kMAX_FEATMAP_SIZE]; - int num_featmap; +struct FeatData +{ + const void* data[kMAX_FEATMAP_SIZE]; + int batch_size; + int channels; + int h[kMAX_FEATMAP_SIZE]; + int w[kMAX_FEATMAP_SIZE]; + float spatial_scale[kMAX_FEATMAP_SIZE]; + int num_featmap; }; -template -__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data, - const int roi_batch_ind, const scalar_t roi_start_w, - const scalar_t roi_start_h, const scalar_t roi_end_w, - const scalar_t roi_end_h, const scalar_t spatial_scale, - const int pw, const int ph, const int c, const int sample_num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width) { - // Force malformed ROIs to be 1x1 - scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.)); - scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.)); - - const scalar_t bin_size_h = roi_height / pooled_height; - const scalar_t bin_size_w = roi_width / pooled_width; - - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - const int sample_num_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); - const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); - - scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0; - const scalar_t y_offset = roi_start_h + ph * bin_size_h; - const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h); - const scalar_t x_offset = roi_start_w + pw * bin_size_w; - const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w); - for (int iy = 0; iy < sample_num_h; iy++) { - const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset); - for (int ix = 0; ix < sample_num_w; ix++) { - const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset); - scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); - if (pool_mode == 0) { - output_val = max(output_val, val); - } else { - output_val += val; - } +template +__device__ scalar_t roi_align_single(const scalar_t* __restrict__ bottom_data, + const int roi_batch_ind, + const scalar_t roi_start_w, + const scalar_t roi_start_h, + const scalar_t roi_end_w, + const scalar_t roi_end_h, + const scalar_t spatial_scale, + const int pw, + const int ph, + const int c, + const int sample_num, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width) +{ + // Force malformed ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, (scalar_t)(aligned ? 0. : 1.)); + scalar_t roi_height = max(roi_end_h - roi_start_h, (scalar_t)(aligned ? 0. : 1.)); + + const scalar_t bin_size_h = roi_height / pooled_height; + const scalar_t bin_size_w = roi_width / pooled_width; + + const scalar_t* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + const int sample_num_h = (sample_num > 0) ? 
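// roi_align_single samples each pooled bin on a sample_num_h x sample_num_w
// grid; when sample_num <= 0 the grid adapts to the bin size (ceil of the bin
// extent). The sample coordinate for grid cell i of a bin is
//   roi_start + bin_index * bin_size + (i + 0.5) * bin_size / samples,
// which is exactly the fma(i + .5f, scale, offset) form used in the loop
// below. A scalar sketch of that coordinate (hypothetical helper):
static float sample_coord(float roi_start, int bin_index, float bin_size,
                          int i, int samples)
{
    return roi_start + bin_index * bin_size + (i + 0.5f) * bin_size / samples;
}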
sample_num : ceil(roi_height / pooled_height); + const int sample_num_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); + + scalar_t output_val = (pool_mode == 0) ? -FLT_MAX : 0; + const scalar_t y_offset = roi_start_h + ph * bin_size_h; + const scalar_t y_scale = bin_size_h / (scalar_t)(sample_num_h); + const scalar_t x_offset = roi_start_w + pw * bin_size_w; + const scalar_t x_scale = bin_size_w / (scalar_t)(sample_num_w); + for (int iy = 0; iy < sample_num_h; iy++) + { + const scalar_t y = fma(scalar_t(iy) + scalar_t(.5f), y_scale, y_offset); + for (int ix = 0; ix < sample_num_w; ix++) + { + const scalar_t x = fma(scalar_t(ix) + scalar_t(.5f), x_scale, x_offset); + scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); + if (pool_mode == 0) + { + output_val = max(output_val, val); + } + else + { + output_val += val; + } + } + } + if (pool_mode != 0) + { + output_val /= max(sample_num_h * sample_num_w, 1); } - } - if (pool_mode != 0) { - output_val /= max(sample_num_h * sample_num_w, 1); - } - return output_val; + return output_val; } -template -__global__ void roi_extractor_kernel(scalar_t *__restrict__ output, - const scalar_t *__restrict__ bottom_rois, FeatData feat_data, - const int pool_mode, const int sample_num, - const float roi_scale_factor, const int finest_scale, - const int pooled_height, const int pooled_width, - int nThreads) { - CUDA_1D_KERNEL_LOOP(index, nThreads) { - const int channels = feat_data.channels; - int tmp_index = index; - const int pw = tmp_index % pooled_width; - tmp_index /= pooled_width; - const int ph = tmp_index % pooled_height; - tmp_index /= pooled_height; - const int c = tmp_index % channels; - const int n = tmp_index / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 5; - - scalar_t roi_offset_x0 = offset_bottom_rois[1]; - scalar_t roi_offset_y0 = offset_bottom_rois[2]; - scalar_t roi_offset_x1 = offset_bottom_rois[3]; - scalar_t roi_offset_y1 = offset_bottom_rois[4]; - - const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0)); - - const int target_lvls = - min(feat_data.num_featmap - 1, - max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); - - if (roi_scale_factor > 0.) 
{ - const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5; - const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5; - const scalar_t half_scale_factor = roi_scale_factor * 0.5; - const scalar_t half_roi_off_w = - fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5)); - const scalar_t half_roi_off_h = - fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5)); - - roi_offset_x0 = roi_off_cx - half_roi_off_w; - roi_offset_x1 = roi_off_cx + half_roi_off_w; - roi_offset_y0 = roi_off_cy - half_roi_off_h; - roi_offset_y1 = roi_off_cy + half_roi_off_h; - } +template +__global__ void roi_extractor_kernel(scalar_t* __restrict__ output, + const scalar_t* __restrict__ bottom_rois, + FeatData feat_data, + const int pool_mode, + const int sample_num, + const float roi_scale_factor, + const int finest_scale, + const int pooled_height, + const int pooled_width, + int nThreads) +{ + CUDA_1D_KERNEL_LOOP(index, nThreads) + { + const int channels = feat_data.channels; + int tmp_index = index; + const int pw = tmp_index % pooled_width; + tmp_index /= pooled_width; + const int ph = tmp_index % pooled_height; + tmp_index /= pooled_height; + const int c = tmp_index % channels; + const int n = tmp_index / channels; + + const scalar_t* offset_bottom_rois = bottom_rois + n * 5; + + scalar_t roi_offset_x0 = offset_bottom_rois[1]; + scalar_t roi_offset_y0 = offset_bottom_rois[2]; + scalar_t roi_offset_x1 = offset_bottom_rois[3]; + scalar_t roi_offset_y1 = offset_bottom_rois[4]; + + const scalar_t scale = sqrtf((roi_offset_y1 - roi_offset_y0) * (roi_offset_x1 - roi_offset_x0)); - const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls]; - const int height = feat_data.h[target_lvls]; - const int width = feat_data.w[target_lvls]; - const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls]; - - const int roi_batch_ind = offset_bottom_rois[0]; - const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0; - const scalar_t roi_start_w = - fma(roi_offset_x0, spatial_scale, offset); // roi_offset_x0 * spatial_scale + offset; - const scalar_t roi_start_h = - fma(roi_offset_y0, spatial_scale, offset); // roi_offset_y0 * spatial_scale + offset; - const scalar_t roi_end_w = - fma(roi_offset_x1, spatial_scale, offset); // (roi_offset_x1) * spatial_scale - offset; - const scalar_t roi_end_h = - fma(roi_offset_y1, spatial_scale, offset); // (roi_offset_y1)*spatial_scale - offset; - - if (pool_mode == 0) { - const scalar_t output_val = roi_align_single( - bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale, - pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width); - output[index] = output_val; - } else { - const scalar_t output_val = roi_align_single( - bottom_data, roi_batch_ind, roi_start_w, roi_start_h, roi_end_w, roi_end_h, spatial_scale, - pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width); - output[index] = output_val; + const int target_lvls = + min(feat_data.num_featmap - 1, + max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); + + if (roi_scale_factor > 0.) 
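Annotation: the target_lvls expression just above is the FPN RoI-to-level rule used by mmdetection's SingleRoIExtractor: a box of size w x h is routed to level floor(log2(sqrt(w * h) / finest_scale + 1e-6)), clamped to [0, num_featmap - 1]. Worked example: with finest_scale = 56 and a 224 x 112 RoI, sqrt(224 * 112) ≈ 158.4, 158.4 / 56 ≈ 2.83, log2(2.83) ≈ 1.5, and the floor gives 1, so the box is pooled from level 1 (stride 8 when featmap_strides is [4, 8, 16, 32]). The 1e-6 term only guards log2f against a zero-area box.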
+        {
+            const scalar_t roi_off_cx = (roi_offset_x0 + roi_offset_x1) * 0.5;
+            const scalar_t roi_off_cy = (roi_offset_y0 + roi_offset_y1) * 0.5;
+            const scalar_t half_scale_factor = roi_scale_factor * 0.5;
+            const scalar_t half_roi_off_w =
+                fma(roi_offset_x1 - roi_offset_x0 + 1, half_scale_factor, scalar_t(-0.5));
+            const scalar_t half_roi_off_h =
+                fma(roi_offset_y1 - roi_offset_y0 + 1, half_scale_factor, scalar_t(-0.5));
+
+            roi_offset_x0 = roi_off_cx - half_roi_off_w;
+            roi_offset_x1 = roi_off_cx + half_roi_off_w;
+            roi_offset_y0 = roi_off_cy - half_roi_off_h;
+            roi_offset_y1 = roi_off_cy + half_roi_off_h;
+        }
+
+        const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls];
+        const int height = feat_data.h[target_lvls];
+        const int width = feat_data.w[target_lvls];
+        const scalar_t* bottom_data = (scalar_t*)feat_data.data[target_lvls];
+
+        const int roi_batch_ind = offset_bottom_rois[0];
+        const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0;
+        const scalar_t roi_start_w =
+            fma(roi_offset_x0, spatial_scale, offset);  // roi_offset_x0 * spatial_scale + offset;
+        const scalar_t roi_start_h =
+            fma(roi_offset_y0, spatial_scale, offset);  // roi_offset_y0 * spatial_scale + offset;
+        const scalar_t roi_end_w =
+            fma(roi_offset_x1, spatial_scale, offset);  // (roi_offset_x1) * spatial_scale - offset;
+        const scalar_t roi_end_h =
+            fma(roi_offset_y1, spatial_scale, offset);  // (roi_offset_y1)*spatial_scale - offset;
+
+        if (pool_mode == 0)
+        {
+            const scalar_t output_val = roi_align_single<scalar_t, 0, aligned>(bottom_data,
+                                                                               roi_batch_ind,
+                                                                               roi_start_w,
+                                                                               roi_start_h,
+                                                                               roi_end_w,
+                                                                               roi_end_h,
+                                                                               spatial_scale,
+                                                                               pw,
+                                                                               ph,
+                                                                               c,
+                                                                               sample_num,
+                                                                               channels,
+                                                                               height,
+                                                                               width,
+                                                                               pooled_height,
+                                                                               pooled_width);
+            output[index] = output_val;
+        }
+        else
+        {
+            const scalar_t output_val = roi_align_single<scalar_t, 1, aligned>(bottom_data,
+                                                                               roi_batch_ind,
+                                                                               roi_start_w,
+                                                                               roi_start_h,
+                                                                               roi_end_w,
+                                                                               roi_end_h,
+                                                                               spatial_scale,
+                                                                               pw,
+                                                                               ph,
+                                                                               c,
+                                                                               sample_num,
+                                                                               channels,
+                                                                               height,
+                                                                               width,
+                                                                               pooled_height,
+                                                                               pooled_width);
+            output[index] = output_val;
+        }
+    }
 }
-template <typename T>
-void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                           int num_feats, int n, int c, int *h, int *w, float *strides,
-                           int aligned_height, int aligned_width, int pool_mode, int sample_num,
-                           float roi_scale_factor, int finest_scale, bool aligned,
-                           cudaStream_t stream) {
-  FeatData feat_data;
-  feat_data.batch_size = n;
-  feat_data.channels = c;
-  feat_data.num_featmap = num_feats;
-  for (int i = 0; i < num_feats; ++i) {
-    feat_data.data[i] = feats[i];
-    feat_data.h[i] = h[i];
-    feat_data.w[i] = w[i];
-    feat_data.spatial_scale[i] = 1. / float(strides[i]);
-  }
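Annotation: CUDA_1D_KERNEL_LOOP above and the GET_BLOCKS/THREADS_PER_BLOCK pair at the launch sites below come from the shared CUDA helper header. A sketch of the usual mmcv-style definitions this code relies on; the 512-thread block and the 4096-block cap are the conventional values and should be treated as assumptions here:

#define THREADS_PER_BLOCK 512

// Grid-stride loop: thread k covers indices k, k + gridDim.x * blockDim.x, ...
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
         i += blockDim.x * gridDim.x)

inline int GET_BLOCKS(const int N)
{
    int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    int max_block_num = 4096;
    return optimal_block_num < max_block_num ? optimal_block_num : max_block_num;
}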
-  int nThreads = num_rois * c * aligned_height * aligned_width;
-  if (aligned) {
-    roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  } else {
-    roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, pool_mode, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  }
+template <typename T>
+void multi_level_roi_align(T* output,
+                           const T* rois,
+                           int num_rois,
+                           const void* const* feats,
+                           int num_feats,
+                           int n,
+                           int c,
+                           int* h,
+                           int* w,
+                           float* strides,
+                           int aligned_height,
+                           int aligned_width,
+                           int pool_mode,
+                           int sample_num,
+                           float roi_scale_factor,
+                           int finest_scale,
+                           bool aligned,
+                           cudaStream_t stream)
+{
+    FeatData feat_data;
+    feat_data.batch_size = n;
+    feat_data.channels = c;
+    feat_data.num_featmap = num_feats;
+    for (int i = 0; i < num_feats; ++i)
+    {
+        feat_data.data[i] = feats[i];
+        feat_data.h[i] = h[i];
+        feat_data.w[i] = w[i];
+        feat_data.spatial_scale[i] = 1. / float(strides[i]);
+    }
+    int nThreads = num_rois * c * aligned_height * aligned_width;
+    if (aligned)
+    {
+        roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(output,
+                                                                                              rois,
+                                                                                              feat_data,
+                                                                                              pool_mode,
+                                                                                              sample_num,
+                                                                                              roi_scale_factor,
+                                                                                              finest_scale,
+                                                                                              aligned_height,
+                                                                                              aligned_width,
+                                                                                              nThreads);
+    }
+    else
+    {
+        roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(output,
+                                                                                               rois,
+                                                                                               feat_data,
+                                                                                               pool_mode,
+                                                                                               sample_num,
+                                                                                               roi_scale_factor,
+                                                                                               finest_scale,
+                                                                                               aligned_height,
+                                                                                               aligned_width,
+                                                                                               nThreads);
+    }
 }
-template void multi_level_roi_align<float>(float *output, const float *rois, int num_rois,
-                                           const void *const *feats, int num_feats, int n, int c,
-                                           int *h, int *w, float *strides, int aligned_height,
-                                           int aligned_width, int pool_mode, int sample_num,
-                                           float roi_scale_factor, int finest_scale, bool aligned,
-                                           cudaStream_t stream);
+template void multi_level_roi_align<float>(float* output,
+                                           const float* rois,
+                                           int num_rois,
+                                           const void* const* feats,
+                                           int num_feats,
+                                           int n,
+                                           int c,
+                                           int* h,
+                                           int* w,
+                                           float* strides,
+                                           int aligned_height,
+                                           int aligned_width,
+                                           int pool_mode,
+                                           int sample_num,
+                                           float roi_scale_factor,
+                                           int finest_scale,
+                                           bool aligned,
+                                           cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
index 5f7220dbf0..efd5564a27 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_roi_align/trt_multi_level_roi_align_kernel.hpp
@@ -3,11 +3,7 @@
 #define TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
 #include <cuda_runtime.h>
-template <typename T>
-void multi_level_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                           int num_feats, int n, int c, int *h, int *w, float *strides,
-                           int aligned_height, int aligned_width, int pool_mode, int sample_num,
-                           float roi_scale_factor, int finest_scale, bool aligned,
-                           cudaStream_t stream);
+template <typename T>
+void multi_level_roi_align(T* output, const T* rois, int num_rois, const void* const* feats, int num_feats, int n, int c, int* h, int* w, float* strides, int aligned_height, int aligned_width, int pool_mode, int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
 #endif  // TRT_MULTI_LEVEL_ROI_ALIGN_KERNEL_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp
b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp index 6637603128..ec3c282ffe 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.cpp @@ -9,220 +9,309 @@ #include "trt_multi_level_rotated_roi_align_kernel.hpp" #include "trt_plugin_helper.hpp" #include "trt_serialize.hpp" -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVMultiLevelRotatedRoiAlign"}; -} // namespace - -TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign( - const std::string &name, int alignedHeight, int alignedWidth, int clockwise, int sampleNum, - const std::vector &featmapStrides, float roiScaleFactor, int finestScale, bool aligned) - : TRTPluginBase(name), - mAlignedHeight(alignedHeight), - mAlignedWidth(alignedWidth), - mClockwise(clockwise), - mSampleNum(sampleNum), - mFeatmapStrides(featmapStrides), - mRoiScaleFactor(roiScaleFactor), - mFinestScale(finestScale), - mAligned(aligned) {} - -TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mAlignedHeight); - deserialize_value(&data, &length, &mAlignedWidth); - deserialize_value(&data, &length, &mClockwise); - deserialize_value(&data, &length, &mSampleNum); - deserialize_value(&data, &length, &mRoiScaleFactor); - deserialize_value(&data, &length, &mFinestScale); - deserialize_value(&data, &length, &mAligned); - deserialize_value(&data, &length, &mFeatmapStrides); -} - -nvinfer1::IPluginV2DynamicExt *TRTMultiLevelRotatedRoiAlign::clone() const TRT_NOEXCEPT { - TRTMultiLevelRotatedRoiAlign *plugin = new TRTMultiLevelRotatedRoiAlign( - mLayerName, mAlignedHeight, mAlignedWidth, mClockwise, mSampleNum, mFeatmapStrides, - mRoiScaleFactor, mFinestScale, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTMultiLevelRotatedRoiAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - // warning, nbInputs should equal to mFeatmapStrides.size() + 1 - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[1].d[1]; - ret.d[2] = exprBuilder.constant(mAlignedHeight); - ret.d[3] = exprBuilder.constant(mAlignedWidth); - - return ret; -} - -bool TRTMultiLevelRotatedRoiAlign::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTMultiLevelRotatedRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT { - // Validate input arguments - ASSERT(nbOutputs == 1); - ASSERT(nbInputs >= 1); - mFeatmapStrides = - std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + nbInputs - 1); -} - -size_t TRTMultiLevelRotatedRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, - int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTMultiLevelRotatedRoiAlign::enqueue(const nvinfer1::PluginTensorDesc 
*inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int num_rois = inputDesc[0].dims.d[0]; - int batch_size = inputDesc[1].dims.d[0]; - int channels = inputDesc[1].dims.d[1]; - - const int kMaxFeatMap = 10; - int heights[kMaxFeatMap]; - int widths[kMaxFeatMap]; - float strides[kMaxFeatMap]; - - int num_feats = mFeatmapStrides.size(); - for (int i = 0; i < num_feats; ++i) { - heights[i] = inputDesc[i + 1].dims.d[2]; - widths[i] = inputDesc[i + 1].dims.d[3]; - strides[i] = mFeatmapStrides[i]; - } - - const void *rois = inputs[0]; - const void *const *feats = inputs + 1; - - multi_level_rotated_roi_align((float *)outputs[0], (const float *)rois, num_rois, feats, - num_feats, batch_size, channels, &heights[0], &widths[0], - &strides[0], mAlignedHeight, mAlignedWidth, mClockwise, - mSampleNum, mRoiScaleFactor, mFinestScale, mAligned, stream); - - return 0; -} - -nvinfer1::DataType TRTMultiLevelRotatedRoiAlign::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return nvinfer1::DataType::kFLOAT; -} - -// IPluginV2 Methods -const char *TRTMultiLevelRotatedRoiAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTMultiLevelRotatedRoiAlign::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int TRTMultiLevelRotatedRoiAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTMultiLevelRotatedRoiAlign::getSerializationSize() const TRT_NOEXCEPT { - return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + - serialized_size(mAlignedWidth) + serialized_size(mClockwise) + - serialized_size(mSampleNum) + serialized_size(mRoiScaleFactor) + - serialized_size(mFinestScale) + serialized_size(mAligned); -} - -void TRTMultiLevelRotatedRoiAlign::serialize(void *buffer) const TRT_NOEXCEPT { - serialize_value(&buffer, mAlignedHeight); - serialize_value(&buffer, mAlignedWidth); - serialize_value(&buffer, mClockwise); - serialize_value(&buffer, mSampleNum); - serialize_value(&buffer, mRoiScaleFactor); - serialize_value(&buffer, mFinestScale); - serialize_value(&buffer, mAligned); - serialize_value(&buffer, mFeatmapStrides); -} - -TRTMultiLevelRotatedRoiAlignCreator::TRTMultiLevelRotatedRoiAlignCreator() { - mPluginAttributes = std::vector( - {nvinfer1::PluginField("output_height"), nvinfer1::PluginField("output_width"), - nvinfer1::PluginField("clockwise"), nvinfer1::PluginField("sampling_ratio"), - nvinfer1::PluginField("featmap_strides"), nvinfer1::PluginField("roi_scale_factor"), - nvinfer1::PluginField("finest_scale"), nvinfer1::PluginField("aligned")}); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTMultiLevelRotatedRoiAlignCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *TRTMultiLevelRotatedRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRotatedRoiAlignCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - int alignedHeight = 7; - int alignedWidth = 7; - int clockwise = 0; - int sampleNum = 2; - std::vector featmapStrides; - float roiScaleFactor = -1; - int finestScale = 56; - bool aligned = false; - - for (int i = 0; i < fc->nbFields; i++) { - if (fc->fields[i].data == nullptr) { - continue; - } - std::string field_name(fc->fields[i].name); - - if 
(field_name.compare("output_height") == 0) { - alignedHeight = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("output_width") == 0) { - alignedWidth = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("clockwise") == 0) { - clockwise = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("sampling_ratio") == 0) { - sampleNum = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("roi_scale_factor") == 0) { - roiScaleFactor = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("finest_scale") == 0) { - finestScale = static_cast(fc->fields[i].data)[0]; - } else if (field_name.compare("featmap_strides") == 0) { - int data_size = (fc->fields[i].length); - const float *data_start = static_cast(fc->fields[i].data); - featmapStrides = std::vector(data_start, data_start + data_size); - } else if (field_name.compare("aligned") == 0) { - int aligned_int = static_cast(fc->fields[i].data)[0]; - aligned = aligned_int != 0; - } - } - - ASSERT(featmapStrides.size() != 0); - - TRTMultiLevelRotatedRoiAlign *plugin = - new TRTMultiLevelRotatedRoiAlign(name, alignedHeight, alignedWidth, clockwise, sampleNum, - featmapStrides, roiScaleFactor, finestScale, aligned); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTMultiLevelRotatedRoiAlignCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTMultiLevelRotatedRoiAlign(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRotatedRoiAlignCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVMultiLevelRotatedRoiAlign"}; + } // namespace + + TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(const std::string& name, + int alignedHeight, + int alignedWidth, + int clockwise, + int sampleNum, + const std::vector& featmapStrides, + float roiScaleFactor, + int finestScale, + bool aligned) + : TRTPluginBase(name) + , mAlignedHeight(alignedHeight) + , mAlignedWidth(alignedWidth) + , mClockwise(clockwise) + , mSampleNum(sampleNum) + , mFeatmapStrides(featmapStrides) + , mRoiScaleFactor(roiScaleFactor) + , mFinestScale(finestScale) + , mAligned(aligned) + { + } + + TRTMultiLevelRotatedRoiAlign::TRTMultiLevelRotatedRoiAlign(const std::string name, + const void* data, + size_t length) + : TRTPluginBase(name) + { + deserialize_value(&data, &length, &mAlignedHeight); + deserialize_value(&data, &length, &mAlignedWidth); + deserialize_value(&data, &length, &mClockwise); + deserialize_value(&data, &length, &mSampleNum); + deserialize_value(&data, &length, &mRoiScaleFactor); + deserialize_value(&data, &length, &mFinestScale); + deserialize_value(&data, &length, &mAligned); + deserialize_value(&data, &length, &mFeatmapStrides); + } + + nvinfer1::IPluginV2DynamicExt* TRTMultiLevelRotatedRoiAlign::clone() const TRT_NOEXCEPT + { + TRTMultiLevelRotatedRoiAlign* plugin = new TRTMultiLevelRotatedRoiAlign(mLayerName, + mAlignedHeight, + mAlignedWidth, + mClockwise, + mSampleNum, + mFeatmapStrides, + mRoiScaleFactor, + mFinestScale, + mAligned); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTMultiLevelRotatedRoiAlign::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT 
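Annotation on the shape logic above: getOutputDimensions returns [num_rois, channels, mAlignedHeight, mAlignedWidth], taking the RoI count from input 0 and the channel count from the first feature map (input 1). The "nbInputs should equal mFeatmapStrides.size() + 1" comment reflects that the op receives the rois tensor plus one feature-map tensor per pyramid level, which is also why configurePlugin trims mFeatmapStrides to nbInputs - 1 entries.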
+ { + // warning, nbInputs should equal to mFeatmapStrides.size() + 1 + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[1].d[1]; + ret.d[2] = exprBuilder.constant(mAlignedHeight); + ret.d[3] = exprBuilder.constant(mAlignedWidth); + + return ret; + } + + bool TRTMultiLevelRotatedRoiAlign::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + } + + void TRTMultiLevelRotatedRoiAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT + { + // Validate input arguments + ASSERT(nbOutputs == 1); + ASSERT(nbInputs >= 1); + mFeatmapStrides = + std::vector(mFeatmapStrides.begin(), mFeatmapStrides.begin() + nbInputs - 1); + } + + size_t TRTMultiLevelRotatedRoiAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTMultiLevelRotatedRoiAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int num_rois = inputDesc[0].dims.d[0]; + int batch_size = inputDesc[1].dims.d[0]; + int channels = inputDesc[1].dims.d[1]; + + const int kMaxFeatMap = 10; + int heights[kMaxFeatMap]; + int widths[kMaxFeatMap]; + float strides[kMaxFeatMap]; + + int num_feats = mFeatmapStrides.size(); + for (int i = 0; i < num_feats; ++i) + { + heights[i] = inputDesc[i + 1].dims.d[2]; + widths[i] = inputDesc[i + 1].dims.d[3]; + strides[i] = mFeatmapStrides[i]; + } + + const void* rois = inputs[0]; + const void* const* feats = inputs + 1; + + multi_level_rotated_roi_align((float*)outputs[0], + (const float*)rois, + num_rois, + feats, + num_feats, + batch_size, + channels, + &heights[0], + &widths[0], + &strides[0], + mAlignedHeight, + mAlignedWidth, + mClockwise, + mSampleNum, + mRoiScaleFactor, + mFinestScale, + mAligned, + stream); + + return 0; + } + + nvinfer1::DataType TRTMultiLevelRotatedRoiAlign::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return nvinfer1::DataType::kFLOAT; + } + + // IPluginV2 Methods + const char* TRTMultiLevelRotatedRoiAlign::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRotatedRoiAlign::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTMultiLevelRotatedRoiAlign::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTMultiLevelRotatedRoiAlign::getSerializationSize() const TRT_NOEXCEPT + { + return serialized_size(mFeatmapStrides) + serialized_size(mAlignedHeight) + + serialized_size(mAlignedWidth) + serialized_size(mClockwise) + + serialized_size(mSampleNum) + serialized_size(mRoiScaleFactor) + + serialized_size(mFinestScale) + serialized_size(mAligned); + } + + void TRTMultiLevelRotatedRoiAlign::serialize(void* buffer) const TRT_NOEXCEPT + { + serialize_value(&buffer, mAlignedHeight); + serialize_value(&buffer, mAlignedWidth); + serialize_value(&buffer, mClockwise); + serialize_value(&buffer, mSampleNum); + serialize_value(&buffer, mRoiScaleFactor); + serialize_value(&buffer, mFinestScale); + 
serialize_value(&buffer, mAligned); + serialize_value(&buffer, mFeatmapStrides); + } + + TRTMultiLevelRotatedRoiAlignCreator::TRTMultiLevelRotatedRoiAlignCreator() + { + mPluginAttributes = std::vector({nvinfer1::PluginField("output_height"), + nvinfer1::PluginField("output_width"), + nvinfer1::PluginField("clockwise"), + nvinfer1::PluginField("sampling_ratio"), + nvinfer1::PluginField("featmap_strides"), + nvinfer1::PluginField("roi_scale_factor"), + nvinfer1::PluginField("finest_scale"), + nvinfer1::PluginField("aligned")}); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTMultiLevelRotatedRoiAlignCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTMultiLevelRotatedRoiAlignCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTMultiLevelRotatedRoiAlignCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + int alignedHeight = 7; + int alignedWidth = 7; + int clockwise = 0; + int sampleNum = 2; + std::vector featmapStrides; + float roiScaleFactor = -1; + int finestScale = 56; + bool aligned = false; + + for (int i = 0; i < fc->nbFields; i++) + { + if (fc->fields[i].data == nullptr) + { + continue; + } + std::string field_name(fc->fields[i].name); + + if (field_name.compare("output_height") == 0) + { + alignedHeight = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("output_width") == 0) + { + alignedWidth = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("clockwise") == 0) + { + clockwise = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("sampling_ratio") == 0) + { + sampleNum = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("roi_scale_factor") == 0) + { + roiScaleFactor = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("finest_scale") == 0) + { + finestScale = static_cast(fc->fields[i].data)[0]; + } + else if (field_name.compare("featmap_strides") == 0) + { + int data_size = (fc->fields[i].length); + const float* data_start = static_cast(fc->fields[i].data); + featmapStrides = std::vector(data_start, data_start + data_size); + } + else if (field_name.compare("aligned") == 0) + { + int aligned_int = static_cast(fc->fields[i].data)[0]; + aligned = aligned_int != 0; + } + } + + ASSERT(featmapStrides.size() != 0); + + TRTMultiLevelRotatedRoiAlign* plugin = new TRTMultiLevelRotatedRoiAlign(name, + alignedHeight, + alignedWidth, + clockwise, + sampleNum, + featmapStrides, + roiScaleFactor, + finestScale, + aligned); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTMultiLevelRotatedRoiAlignCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTMultiLevelRotatedRoiAlign(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTMultiLevelRotatedRoiAlignCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp index cf0bab7584..906a429f6e 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp +++ 
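Annotation: the write order in serialize() above must mirror, byte for byte, the read order in the deserializing constructor (alignedHeight, alignedWidth, clockwise, sampleNum, roiScaleFactor, finestScale, aligned, then the strides vector). The helpers in trt_serialize.hpp follow the usual cursor-advancing pattern; a minimal sketch for trivially copyable types (mmdeploy's real implementation also handles std::vector, so this is illustrative only):

#include <cassert>
#include <cstring>

template <typename T>
void serialize_value(void** buffer, const T& value)
{
    // Copy the raw bytes, then advance the write cursor.
    std::memcpy(*buffer, &value, sizeof(T));
    *buffer = static_cast<char*>(*buffer) + sizeof(T);
}

template <typename T>
void deserialize_value(const void** data, size_t* length, T* value)
{
    // Copy the raw bytes back out, advance the read cursor, shrink the budget.
    assert(*length >= sizeof(T));
    std::memcpy(value, *data, sizeof(T));
    *data = static_cast<const char*>(*data) + sizeof(T);
    *length -= sizeof(T);
}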
b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align.hpp @@ -10,70 +10,95 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTMultiLevelRotatedRoiAlign : public TRTPluginBase { - public: - TRTMultiLevelRotatedRoiAlign(const std::string &name, int alignedHeight, int alignedWidth, - int clockwise, int sampleNum, - const std::vector &featmapStrides, float roiScaleFactor = -1, - int finestScale = 56, bool aligned = false); - - TRTMultiLevelRotatedRoiAlign(const std::string name, const void *data, size_t length); - - TRTMultiLevelRotatedRoiAlign() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - int mAlignedHeight; - int mAlignedWidth; - int mClockwise; - int mSampleNum; - std::vector mFeatmapStrides; - float mRoiScaleFactor; - int mFinestScale; - bool mAligned; -}; - -class TRTMultiLevelRotatedRoiAlignCreator : public TRTPluginCreatorBase { - public: - TRTMultiLevelRotatedRoiAlignCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class TRTMultiLevelRotatedRoiAlign : public TRTPluginBase + { + public: + TRTMultiLevelRotatedRoiAlign(const std::string& name, + int alignedHeight, + int alignedWidth, + int clockwise, + int sampleNum, + const std::vector& featmapStrides, + float roiScaleFactor = -1, + int finestScale = 56, + bool aligned = false); + + TRTMultiLevelRotatedRoiAlign(const std::string name, + const void* data, + size_t length); + + TRTMultiLevelRotatedRoiAlign() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool 
supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + int mAlignedHeight; + int mAlignedWidth; + int mClockwise; + int mSampleNum; + std::vector mFeatmapStrides; + float mRoiScaleFactor; + int mFinestScale; + bool mAligned; + }; + + class TRTMultiLevelRotatedRoiAlignCreator : public TRTPluginCreatorBase + { + public: + TRTMultiLevelRotatedRoiAlignCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu index 1c6f292bae..3b09215547 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.cu @@ -10,155 +10,236 @@ #include "trt_plugin_helper.hpp" const int kMAX_FEATMAP_SIZE = 10; -struct FeatData { - const void *data[kMAX_FEATMAP_SIZE]; - int batch_size; - int channels; - int h[kMAX_FEATMAP_SIZE]; - int w[kMAX_FEATMAP_SIZE]; - float spatial_scale[kMAX_FEATMAP_SIZE]; - int num_featmap; +struct FeatData +{ + const void* data[kMAX_FEATMAP_SIZE]; + int batch_size; + int channels; + int h[kMAX_FEATMAP_SIZE]; + int w[kMAX_FEATMAP_SIZE]; + float spatial_scale[kMAX_FEATMAP_SIZE]; + int num_featmap; }; -template -__device__ scalar_t roi_align_single(const scalar_t *__restrict__ bottom_data, - const int roi_batch_ind, scalar_t roi_center_w, - scalar_t roi_center_h, scalar_t roi_width, scalar_t roi_height, - scalar_t theta, const scalar_t spatial_scale, const int pw, - const int ph, const int c, const int sample_num, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width) { - // Force malformed ROIs to be 1x1 - - roi_width = max(roi_width, (scalar_t)1.); - roi_height = max(roi_height, (scalar_t)1.); - - const 
scalar_t bin_size_h = roi_height / scalar_t(pooled_height); - const scalar_t bin_size_w = roi_width / scalar_t(pooled_width); - - const scalar_t *offset_bottom_data = - bottom_data + (roi_batch_ind * channels + c) * height * width; - - const int roi_bin_grid_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); - const int roi_bin_grid_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); - - const scalar_t roi_start_h = -roi_height / scalar_t(2.0); - const scalar_t roi_start_w = -roi_width / scalar_t(2.0); - const scalar_t cosscalar_theta = cos(theta); - const scalar_t sinscalar_theta = sin(theta); - - // We do average (integral) pooling inside a bin - const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 - - scalar_t output_val = 0.; - - for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 - const scalar_t yy = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const scalar_t xx = - roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - - // Rotate by theta (counterclockwise) around the center and translate - scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; - scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; - - scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); - output_val += val; +template +__device__ scalar_t roi_align_single(const scalar_t* __restrict__ bottom_data, + const int roi_batch_ind, + scalar_t roi_center_w, + scalar_t roi_center_h, + scalar_t roi_width, + scalar_t roi_height, + scalar_t theta, + const scalar_t spatial_scale, + const int pw, + const int ph, + const int c, + const int sample_num, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width) +{ + // Force malformed ROIs to be 1x1 + + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + const scalar_t bin_size_h = roi_height / scalar_t(pooled_height); + const scalar_t bin_size_w = roi_width / scalar_t(pooled_width); + + const scalar_t* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + const int roi_bin_grid_h = (sample_num > 0) ? sample_num : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); + + const scalar_t roi_start_h = -roi_height / scalar_t(2.0); + const scalar_t roi_start_w = -roi_width / scalar_t(2.0); + const scalar_t cosscalar_theta = cos(theta); + const scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { // e.g., iy = 0, 1 + const scalar_t yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + const scalar_t xx = + roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate(offset_bottom_data, height, width, y, x); + output_val += val; + } } - } - return output_val / count; + return output_val / count; } -template -__global__ void rotated_roi_extractor_kernel(scalar_t *__restrict__ output, - const scalar_t *__restrict__ bottom_rois, - FeatData feat_data, const int clockwise, - const int sample_num, const float roi_scale_factor, - const int finest_scale, const int pooled_height, - const int pooled_width, int nThreads) { - CUDA_1D_KERNEL_LOOP(index, nThreads) { - const int channels = feat_data.channels; - int tmp_index = index; - const int pw = tmp_index % pooled_width; - tmp_index /= pooled_width; - const int ph = tmp_index % pooled_height; - tmp_index /= pooled_height; - const int c = tmp_index % channels; - const int n = tmp_index / channels; - - const scalar_t *offset_bottom_rois = bottom_rois + n * 6; - - scalar_t roi_offset_x0 = offset_bottom_rois[1]; - scalar_t roi_offset_y0 = offset_bottom_rois[2]; - scalar_t roi_offset_width = offset_bottom_rois[3]; - scalar_t roi_offset_height = offset_bottom_rois[4]; - scalar_t theta = offset_bottom_rois[5]; - - const scalar_t scale = sqrtf(roi_offset_width * roi_offset_height); - - const int target_lvls = - min(feat_data.num_featmap - 1, - max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); - - if (roi_scale_factor > 0.) { - roi_offset_width = roi_offset_width * roi_scale_factor; - roi_offset_height = roi_offset_height * roi_scale_factor; +template +__global__ void rotated_roi_extractor_kernel(scalar_t* __restrict__ output, + const scalar_t* __restrict__ bottom_rois, + FeatData feat_data, + const int clockwise, + const int sample_num, + const float roi_scale_factor, + const int finest_scale, + const int pooled_height, + const int pooled_width, + int nThreads) +{ + CUDA_1D_KERNEL_LOOP(index, nThreads) + { + const int channels = feat_data.channels; + int tmp_index = index; + const int pw = tmp_index % pooled_width; + tmp_index /= pooled_width; + const int ph = tmp_index % pooled_height; + tmp_index /= pooled_height; + const int c = tmp_index % channels; + const int n = tmp_index / channels; + + const scalar_t* offset_bottom_rois = bottom_rois + n * 6; + + scalar_t roi_offset_x0 = offset_bottom_rois[1]; + scalar_t roi_offset_y0 = offset_bottom_rois[2]; + scalar_t roi_offset_width = offset_bottom_rois[3]; + scalar_t roi_offset_height = offset_bottom_rois[4]; + scalar_t theta = offset_bottom_rois[5]; + + const scalar_t scale = sqrtf(roi_offset_width * roi_offset_height); + + const int target_lvls = + min(feat_data.num_featmap - 1, + max(0, int(floorf(log2f(scale / (scalar_t)(finest_scale) + 1e-6))))); + + if (roi_scale_factor > 0.) 
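Annotation: the coordinate transform inside the sampling loop above is the heart of the rotated variant. Each sub-bin grid point (xx, yy) is expressed relative to the box centre, rotated counter-clockwise by theta, then translated into feature-map coordinates; the clockwise flag in the extractor kernel below simply negates theta before this step. The same transform restated host-side (names are mine, not from the kernel):

#include <cmath>

// Map a grid point (xx, yy), given relative to the RoI centre, to feature-map
// coordinates (x, y) after a counter-clockwise rotation by theta.
inline void rotate_sample_point(float xx, float yy, float theta, float center_x, float center_y, float& x, float& y)
{
    const float cos_t = std::cos(theta);
    const float sin_t = std::sin(theta);
    y = yy * cos_t - xx * sin_t + center_y;
    x = yy * sin_t + xx * cos_t + center_x;
}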
+ { + roi_offset_width = roi_offset_width * roi_scale_factor; + roi_offset_height = roi_offset_height * roi_scale_factor; + } + + const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls]; + const int height = feat_data.h[target_lvls]; + const int width = feat_data.w[target_lvls]; + const scalar_t* bottom_data = (scalar_t*)feat_data.data[target_lvls]; + + const int roi_batch_ind = offset_bottom_rois[0]; + const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0; + const scalar_t roi_center_w = fma(roi_offset_x0, spatial_scale, offset); + const scalar_t roi_center_h = fma(roi_offset_y0, spatial_scale, offset); + const scalar_t roi_width = roi_offset_width * spatial_scale; + const scalar_t roi_height = roi_offset_height * spatial_scale; + + theta = clockwise > 0 ? -theta : theta; + + const scalar_t output_val = roi_align_single(bottom_data, + roi_batch_ind, + roi_center_w, + roi_center_h, + roi_width, + roi_height, + theta, + spatial_scale, + pw, + ph, + c, + sample_num, + channels, + height, + width, + pooled_height, + pooled_width); + output[index] = output_val; } - - const scalar_t spatial_scale = (scalar_t)feat_data.spatial_scale[target_lvls]; - const int height = feat_data.h[target_lvls]; - const int width = feat_data.w[target_lvls]; - const scalar_t *bottom_data = (scalar_t *)feat_data.data[target_lvls]; - - const int roi_batch_ind = offset_bottom_rois[0]; - const scalar_t offset = aligned ? (scalar_t)-0.5 : (scalar_t)0.0; - const scalar_t roi_center_w = fma(roi_offset_x0, spatial_scale, offset); - const scalar_t roi_center_h = fma(roi_offset_y0, spatial_scale, offset); - const scalar_t roi_width = roi_offset_width * spatial_scale; - const scalar_t roi_height = roi_offset_height * spatial_scale; - - theta = clockwise > 0 ? -theta : theta; - - const scalar_t output_val = roi_align_single( - bottom_data, roi_batch_ind, roi_center_w, roi_center_h, roi_width, roi_height, theta, - spatial_scale, pw, ph, c, sample_num, channels, height, width, pooled_height, pooled_width); - output[index] = output_val; - } } -template -void multi_level_rotated_roi_align(T *output, const T *rois, int num_rois, const void *const *feats, - int num_feats, int n, int c, int *h, int *w, float *strides, - int aligned_height, int aligned_width, int clockwise, - int sample_num, float roi_scale_factor, int finest_scale, - bool aligned, cudaStream_t stream) { - FeatData feat_data; - feat_data.batch_size = n; - feat_data.channels = c; - feat_data.num_featmap = num_feats; - for (int i = 0; i < num_feats; ++i) { - feat_data.data[i] = feats[i]; - feat_data.h[i] = h[i]; - feat_data.w[i] = w[i]; - feat_data.spatial_scale[i] = 1. 
/ float(strides[i]);
-  }
-  int nThreads = num_rois * c * aligned_height * aligned_width;
-  if (aligned) {
-    rotated_roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, clockwise, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  } else {
-    rotated_roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(
-        output, rois, feat_data, clockwise, sample_num, roi_scale_factor, finest_scale,
-        aligned_height, aligned_width, nThreads);
-  }
+template <typename T>
+void multi_level_rotated_roi_align(T* output,
+                                   const T* rois,
+                                   int num_rois,
+                                   const void* const* feats,
+                                   int num_feats,
+                                   int n,
+                                   int c,
+                                   int* h,
+                                   int* w,
+                                   float* strides,
+                                   int aligned_height,
+                                   int aligned_width,
+                                   int clockwise,
+                                   int sample_num,
+                                   float roi_scale_factor,
+                                   int finest_scale,
+                                   bool aligned,
+                                   cudaStream_t stream)
+{
+    FeatData feat_data;
+    feat_data.batch_size = n;
+    feat_data.channels = c;
+    feat_data.num_featmap = num_feats;
+    for (int i = 0; i < num_feats; ++i)
+    {
+        feat_data.data[i] = feats[i];
+        feat_data.h[i] = h[i];
+        feat_data.w[i] = w[i];
+        feat_data.spatial_scale[i] = 1. / float(strides[i]);
+    }
+    int nThreads = num_rois * c * aligned_height * aligned_width;
+    if (aligned)
+    {
+        rotated_roi_extractor_kernel<T, true><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(output,
+                                                                                                      rois,
+                                                                                                      feat_data,
+                                                                                                      clockwise,
+                                                                                                      sample_num,
+                                                                                                      roi_scale_factor,
+                                                                                                      finest_scale,
+                                                                                                      aligned_height,
+                                                                                                      aligned_width,
+                                                                                                      nThreads);
+    }
+    else
+    {
+        rotated_roi_extractor_kernel<T, false><<<GET_BLOCKS(nThreads), THREADS_PER_BLOCK, 0, stream>>>(output,
+                                                                                                       rois,
+                                                                                                       feat_data,
+                                                                                                       clockwise,
+                                                                                                       sample_num,
+                                                                                                       roi_scale_factor,
+                                                                                                       finest_scale,
+                                                                                                       aligned_height,
+                                                                                                       aligned_width,
+                                                                                                       nThreads);
+    }
 }
-template void multi_level_rotated_roi_align<float>(
-    float *output, const float *rois, int num_rois, const void *const *feats, int num_feats, int n,
-    int c, int *h, int *w, float *strides, int aligned_height, int aligned_width, int clockwise,
-    int sample_num, float roi_scale_factor, int finest_scale, bool aligned, cudaStream_t stream);
+template void multi_level_rotated_roi_align<float>(float* output,
+                                                   const float* rois,
+                                                   int num_rois,
+                                                   const void* const* feats,
+                                                   int num_feats,
+                                                   int n,
+                                                   int c,
+                                                   int* h,
+                                                   int* w,
+                                                   float* strides,
+                                                   int aligned_height,
+                                                   int aligned_width,
+                                                   int clockwise,
+                                                   int sample_num,
+                                                   float roi_scale_factor,
+                                                   int finest_scale,
+                                                   bool aligned,
+                                                   cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
index fc3700df3b..b3f7fc0f94 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_level_rotated_roi_align/trt_multi_level_rotated_roi_align_kernel.hpp
@@ -3,11 +3,24 @@
 #define TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_KERNEL_HPP
 #include <cuda_runtime.h>
-template <typename T>
-void multi_level_rotated_roi_align(T *output, const T *rois, int num_rois, const void *const *feats,
-                                   int num_feats, int n, int c, int *h, int *w, float *strides,
-                                   int aligned_height, int aligned_width, int clockwise,
-                                   int sample_num, float roi_scale_factor, int finest_scale,
-                                   bool aligned, cudaStream_t stream);
+template <typename T>
+void multi_level_rotated_roi_align(T* output,
+                                   const T* rois,
+                                   int num_rois,
+                                   const void* const* feats,
+                                   int num_feats,
+                                   int n,
+                                   int c,
+                                   int* h,
+                                   int* w,
+                                   float* strides,
+                                   int aligned_height,
+                                   int aligned_width,
+                                   int clockwise,
+                                   int sample_num,
+                                   float roi_scale_factor,
+                                   int finest_scale,
+                                   bool aligned,
+                                   cudaStream_t stream);
 #endif  //
TRT_MULTI_LEVEL_ROTATED_ROI_ALIGN_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp index d14a25e929..73a3a8e6b9 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.cpp @@ -10,164 +10,226 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"MMCVMultiScaleDeformableAttention"}; -} // namespace - -MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic( - const std::string &name) - : TRTPluginBase(name) {} - -MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(const std::string name, - const void *data, - size_t length) - : TRTPluginBase(name) {} -MultiScaleDeformableAttnPluginDynamic::~MultiScaleDeformableAttnPluginDynamic() {} - -nvinfer1::IPluginV2DynamicExt *MultiScaleDeformableAttnPluginDynamic::clone() const TRT_NOEXCEPT { - MultiScaleDeformableAttnPluginDynamic *plugin = - new MultiScaleDeformableAttnPluginDynamic(mLayerName); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs MultiScaleDeformableAttnPluginDynamic::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = 3; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[3].d[1]; - - ret.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *inputs[0].d[3]); - - return ret; -} - -bool MultiScaleDeformableAttnPluginDynamic::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) { - if ((pos == 1) || (pos == 2)) { - return (ioDesc[pos].type == nvinfer1::DataType::kINT32); - } else { - return ((ioDesc[pos].type == ioDesc[0].type) && - ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT) || - (ioDesc[pos].type == nvinfer1::DataType::kHALF))); - } - } else { - return false; - } -} - -void MultiScaleDeformableAttnPluginDynamic::configurePlugin( - const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, int nbOutputs) TRT_NOEXCEPT {} - -size_t MultiScaleDeformableAttnPluginDynamic::getWorkspaceSize( - const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int MultiScaleDeformableAttnPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, - cudaStream_t stream) TRT_NOEXCEPT { - int32_t const batch = inputDesc[0].dims.d[0]; - int32_t spatial_size = inputDesc[0].dims.d[1]; - int32_t num_heads = inputDesc[0].dims.d[2]; - int32_t channels = inputDesc[0].dims.d[3]; - int32_t num_levels = inputDesc[1].dims.d[0]; - int32_t num_query = inputDesc[3].dims.d[1]; - int32_t num_point = inputDesc[3].dims.d[4]; - int32_t rc = 0; - if (inputDesc[0].type == nvinfer1::DataType::kFLOAT) { - float const *value = static_cast(inputs[0]); - int32_t const *spatialShapes = static_cast(inputs[1]); - int32_t const *levelStartIndex = static_cast(inputs[2]); - float const 
*samplingLoc = static_cast(inputs[3]); - float const *attnWeight = static_cast(inputs[4]); - float *output = static_cast(outputs[0]); - - rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc, attnWeight, - output, batch, spatial_size, num_heads, channels, num_levels, - num_query, num_point, stream); - } else if (inputDesc[0].type == nvinfer1::DataType::kHALF) { - const __half *value = static_cast(inputs[0]); - int32_t const *spatialShapes = static_cast(inputs[1]); - int32_t const *levelStartIndex = static_cast(inputs[2]); - const __half *samplingLoc = static_cast(inputs[3]); - const __half *attnWeight = static_cast(inputs[4]); - __half *output = static_cast<__half *>(outputs[0]); - - rc = ms_deform_attn_cuda_forward(value, spatialShapes, levelStartIndex, samplingLoc, attnWeight, - output, batch, spatial_size, num_heads, channels, num_levels, - num_query, num_point, stream); - } - - return rc; -} - -nvinfer1::DataType MultiScaleDeformableAttnPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *MultiScaleDeformableAttnPluginDynamic::getPluginType() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *MultiScaleDeformableAttnPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int MultiScaleDeformableAttnPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t MultiScaleDeformableAttnPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { - return 0; -} - -void MultiScaleDeformableAttnPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT {} - -void MultiScaleDeformableAttnPluginDynamic::attachToContext( - cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT {} - -void MultiScaleDeformableAttnPluginDynamic::detachFromContext() TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -MultiScaleDeformableAttnPluginDynamicCreator::MultiScaleDeformableAttnPluginDynamicCreator() { - mPluginAttributes.clear(); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *MultiScaleDeformableAttnPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *MultiScaleDeformableAttnPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *MultiScaleDeformableAttnPluginDynamicCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - MultiScaleDeformableAttnPluginDynamic *plugin = new MultiScaleDeformableAttnPluginDynamic(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *MultiScaleDeformableAttnPluginDynamicCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new MultiScaleDeformableAttnPluginDynamic(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(MultiScaleDeformableAttnPluginDynamicCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"MMCVMultiScaleDeformableAttention"}; + } // namespace + + MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic( + const std::string& name) + : TRTPluginBase(name) + { + } + + 
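Annotation: two details of this plugin are worth noting. In supportsFormatCombination, positions 1 and 2 (spatial_shapes and level_start_index) are pinned to kINT32, while every other tensor must match input 0's type as either kFLOAT or kHALF in linear format; that is what lets one plugin serve both FP32 and FP16 engines. And because the op carries no configured state, getSerializationSize() returns 0 and serialize() plus the deserializing constructor are intentionally empty.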
MultiScaleDeformableAttnPluginDynamic::MultiScaleDeformableAttnPluginDynamic(const std::string name, + const void* data, + size_t length) + : TRTPluginBase(name) + { + } + MultiScaleDeformableAttnPluginDynamic::~MultiScaleDeformableAttnPluginDynamic() {} + + nvinfer1::IPluginV2DynamicExt* MultiScaleDeformableAttnPluginDynamic::clone() const TRT_NOEXCEPT + { + MultiScaleDeformableAttnPluginDynamic* plugin = + new MultiScaleDeformableAttnPluginDynamic(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs MultiScaleDeformableAttnPluginDynamic::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + nvinfer1::DimsExprs ret; + ret.nbDims = 3; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[3].d[1]; + + ret.d[2] = exprBuilder.operation(DimensionOperation::kPROD, *inputs[0].d[2], *inputs[0].d[3]); + + return ret; + } + + bool MultiScaleDeformableAttnPluginDynamic::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) + { + if ((pos == 1) || (pos == 2)) + { + return (ioDesc[pos].type == nvinfer1::DataType::kINT32); + } + else + { + return ((ioDesc[pos].type == ioDesc[0].type) && + ((ioDesc[pos].type == nvinfer1::DataType::kFLOAT) || + (ioDesc[pos].type == nvinfer1::DataType::kHALF))); + } + } + else + { + return false; + } + } + + void MultiScaleDeformableAttnPluginDynamic::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t MultiScaleDeformableAttnPluginDynamic::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int MultiScaleDeformableAttnPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + int32_t const batch = inputDesc[0].dims.d[0]; + int32_t spatial_size = inputDesc[0].dims.d[1]; + int32_t num_heads = inputDesc[0].dims.d[2]; + int32_t channels = inputDesc[0].dims.d[3]; + int32_t num_levels = inputDesc[1].dims.d[0]; + int32_t num_query = inputDesc[3].dims.d[1]; + int32_t num_point = inputDesc[3].dims.d[4]; + int32_t rc = 0; + if (inputDesc[0].type == nvinfer1::DataType::kFLOAT) + { + float const* value = static_cast(inputs[0]); + int32_t const* spatialShapes = static_cast(inputs[1]); + int32_t const* levelStartIndex = static_cast(inputs[2]); + float const* samplingLoc = static_cast(inputs[3]); + float const* attnWeight = static_cast(inputs[4]); + float* output = static_cast(outputs[0]); + + rc = ms_deform_attn_cuda_forward(value, + spatialShapes, + levelStartIndex, + samplingLoc, + attnWeight, + output, + batch, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + stream); + } + else if (inputDesc[0].type == nvinfer1::DataType::kHALF) + { + const __half* value = static_cast(inputs[0]); + int32_t const* spatialShapes = static_cast(inputs[1]); + int32_t const* levelStartIndex = static_cast(inputs[2]); + const __half* samplingLoc = static_cast(inputs[3]); + const __half* attnWeight = static_cast(inputs[4]); + __half* output = static_cast<__half*>(outputs[0]); + 
+ rc = ms_deform_attn_cuda_forward(value, + spatialShapes, + levelStartIndex, + samplingLoc, + attnWeight, + output, + batch, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + stream); + } + + return rc; + } + + nvinfer1::DataType MultiScaleDeformableAttnPluginDynamic::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* MultiScaleDeformableAttnPluginDynamic::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* MultiScaleDeformableAttnPluginDynamic::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int MultiScaleDeformableAttnPluginDynamic::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t MultiScaleDeformableAttnPluginDynamic::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void MultiScaleDeformableAttnPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT {} + + void MultiScaleDeformableAttnPluginDynamic::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT {} + + void MultiScaleDeformableAttnPluginDynamic::detachFromContext() TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + MultiScaleDeformableAttnPluginDynamicCreator::MultiScaleDeformableAttnPluginDynamicCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* MultiScaleDeformableAttnPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* MultiScaleDeformableAttnPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* MultiScaleDeformableAttnPluginDynamicCreator::createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + MultiScaleDeformableAttnPluginDynamic* plugin = new MultiScaleDeformableAttnPluginDynamic(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* MultiScaleDeformableAttnPluginDynamicCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new MultiScaleDeformableAttnPluginDynamic(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(MultiScaleDeformableAttnPluginDynamicCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp index 7e66e9e54d..62821e27ed 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn.hpp @@ -9,62 +9,85 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class MultiScaleDeformableAttnPluginDynamic : public TRTPluginBase { - public: - MultiScaleDeformableAttnPluginDynamic(const std::string &name); - - MultiScaleDeformableAttnPluginDynamic(const std::string name, const void *data, size_t length); - - MultiScaleDeformableAttnPluginDynamic(); - - ~MultiScaleDeformableAttnPluginDynamic() TRT_NOEXCEPT override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs 
getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnnContext, cublasContext *cublasContext, - nvinfer1::IGpuAllocator *gpuAllocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; -}; - -class MultiScaleDeformableAttnPluginDynamicCreator : public TRTPluginCreatorBase { - public: - MultiScaleDeformableAttnPluginDynamicCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class MultiScaleDeformableAttnPluginDynamic : public TRTPluginBase + { + public: + MultiScaleDeformableAttnPluginDynamic(const std::string& name); + + MultiScaleDeformableAttnPluginDynamic(const std::string name, + const void* data, + size_t length); + + MultiScaleDeformableAttnPluginDynamic(); + + ~MultiScaleDeformableAttnPluginDynamic() TRT_NOEXCEPT override; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + 
nvinfer1::IGpuAllocator* gpuAllocator) TRT_NOEXCEPT override;
+
+        void detachFromContext() TRT_NOEXCEPT override;
+
+        // IPluginV2Ext Methods
+        nvinfer1::DataType getOutputDataType(int index,
+                                             const nvinfer1::DataType* inputTypes,
+                                             int nbInputs) const TRT_NOEXCEPT override;
+
+        // IPluginV2 Methods
+        const char* getPluginType() const TRT_NOEXCEPT override;
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
+        int         getNbOutputs() const TRT_NOEXCEPT override;
+        size_t      getSerializationSize() const TRT_NOEXCEPT override;
+        void        serialize(void* buffer) const TRT_NOEXCEPT override;
+    };
+
+    class MultiScaleDeformableAttnPluginDynamicCreator : public TRTPluginCreatorBase
+    {
+      public:
+        MultiScaleDeformableAttnPluginDynamicCreator();
+
+        const char* getPluginName() const TRT_NOEXCEPT override;
+
+        const char* getPluginVersion() const TRT_NOEXCEPT override;
+
+        nvinfer1::IPluginV2* createPlugin(const char* name,
+                                          const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override;
+
+        nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                               const void* serialData,
+                                               size_t serialLength) TRT_NOEXCEPT override;
+    };
 }  // namespace mmdeploy

 #endif  // TRT_MS_DEFORM_ATTN_HPP
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
index 6b7588eae0..2d10a1ee9f 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cu
@@ -7,58 +7,113 @@
 #include "trt_ms_deform_attn_kernel.hpp"
 #include "trt_plugin_helper.hpp"

-template <typename scalar_t>
-void ms_deformable_im2col_cuda(cudaStream_t stream, scalar_t const* dataValue,
-                               int32_t const* dataSpatialShapes, int32_t const* dataLevelStartIndex,
-                               scalar_t const* dataSamplingLoc, scalar_t const* dataAttnWeight,
-                               int32_t const batchSize, int32_t const spatialSize,
-                               int32_t const numHeads, int32_t const channels,
-                               int32_t const numLevels, int32_t const numQuery,
-                               int32_t const numPoint, scalar_t* dataCol) {
-  int32_t const numKernels = batchSize * numQuery * numHeads * channels;
-  int32_t const numActualKernels = batchSize * numQuery * numHeads * channels;
+template<typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                               scalar_t const* dataValue,
+                               int32_t const*  dataSpatialShapes,
+                               int32_t const*  dataLevelStartIndex,
+                               scalar_t const* dataSamplingLoc,
+                               scalar_t const* dataAttnWeight,
+                               int32_t const   batchSize,
+                               int32_t const   spatialSize,
+                               int32_t const   numHeads,
+                               int32_t const   channels,
+                               int32_t const   numLevels,
+                               int32_t const   numQuery,
+                               int32_t const   numPoint,
+                               scalar_t*       dataCol)
+{
+    int32_t const numKernels       = batchSize * numQuery * numHeads * channels;
+    int32_t const numActualKernels = batchSize * numQuery * numHeads * channels;

-  ms_deformable_im2col_gpu_kernel<scalar_t>
-      <<<GET_BLOCKS(numActualKernels), THREADS_PER_BLOCK, 0, stream>>>(
-          numKernels, dataValue, dataSpatialShapes, dataLevelStartIndex, dataSamplingLoc,
-          dataAttnWeight, batchSize, spatialSize, numHeads, channels, numLevels, numQuery, numPoint,
-          dataCol);
+    ms_deformable_im2col_gpu_kernel<scalar_t>
+        <<<GET_BLOCKS(numActualKernels), THREADS_PER_BLOCK, 0, stream>>>(numKernels,
+                                                                         dataValue,
+                                                                         dataSpatialShapes,
+                                                                         dataLevelStartIndex,
+                                                                         dataSamplingLoc,
+                                                                         dataAttnWeight,
+                                                                         batchSize,
+                                                                         spatialSize,
+                                                                         numHeads,
+                                                                         channels,
+                                                                         numLevels,
+                                                                         numQuery,
+                                                                         numPoint,
+                                                                         dataCol);
 }

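+// Launch-configuration note: one thread handles one (batch, query, head, channel)
+// output element. GET_BLOCKS and THREADS_PER_BLOCK are assumed to be the shared
+// 1-D grid helpers from the common CUDA helper headers, so the grid covers at
+// least numActualKernels threads in total.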
-template <typename scalar_t>
-int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes,
-                                    const int32_t* levelStartIndex, const scalar_t* samplingLoc,
-                                    const scalar_t* attnWeight, scalar_t* output, int32_t batch,
-                                    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels,
-                                    int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
-                                    cudaStream_t stream) {
-  auto perValueSize = mSpatialSize * mNumHeads * mChannels;
-  auto perSampleLocSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint * 2;
-  auto perAttnWeightSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint;
-  auto perOutputSize = mNumQuery * mNumHeads * mChannels;
+template<typename scalar_t>
+int32_t ms_deform_attn_cuda_forward(const scalar_t* value,
+                                    const int32_t*  spatialShapes,
+                                    const int32_t*  levelStartIndex,
+                                    const scalar_t* samplingLoc,
+                                    const scalar_t* attnWeight,
+                                    scalar_t*       output,
+                                    int32_t         batch,
+                                    int32_t         mSpatialSize,
+                                    int32_t         mNumHeads,
+                                    int32_t         mChannels,
+                                    int32_t         mNumLevels,
+                                    int32_t         mNumQuery,
+                                    int32_t         mNumPoint,
+                                    cudaStream_t    stream)
+{
+    auto perValueSize      = mSpatialSize * mNumHeads * mChannels;
+    auto perSampleLocSize  = mNumQuery * mNumHeads * mNumLevels * mNumPoint * 2;
+    auto perAttnWeightSize = mNumQuery * mNumHeads * mNumLevels * mNumPoint;
+    auto perOutputSize     = mNumQuery * mNumHeads * mChannels;

-  int32_t mIm2colStep = batch;
+    int32_t mIm2colStep = batch;

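+    // With mIm2colStep equal to the full batch size, the loop below executes
+    // exactly once and processes all samples in a single kernel launch; a
+    // smaller step would split the batch across several launches instead.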
-  for (int32_t n = 0; n < batch / mIm2colStep; ++n) {
-    auto columns = output + n * mIm2colStep * perOutputSize;
-    ms_deformable_im2col_cuda(
-        stream, value + n * mIm2colStep * perValueSize, spatialShapes, levelStartIndex,
-        samplingLoc + n * mIm2colStep * perSampleLocSize,
-        attnWeight + n * mIm2colStep * perAttnWeightSize, mIm2colStep, mSpatialSize, mNumHeads,
-        mChannels, mNumLevels, mNumQuery, mNumPoint, columns);
-  }
+    for (int32_t n = 0; n < batch / mIm2colStep; ++n)
+    {
+        auto columns = output + n * mIm2colStep * perOutputSize;
+        ms_deformable_im2col_cuda(stream,
+                                  value + n * mIm2colStep * perValueSize,
+                                  spatialShapes,
+                                  levelStartIndex,
+                                  samplingLoc + n * mIm2colStep * perSampleLocSize,
+                                  attnWeight + n * mIm2colStep * perAttnWeightSize,
+                                  mIm2colStep,
+                                  mSpatialSize,
+                                  mNumHeads,
+                                  mChannels,
+                                  mNumLevels,
+                                  mNumQuery,
+                                  mNumPoint,
+                                  columns);
+    }

-  return 0;
+    return 0;
 }

-template int32_t ms_deform_attn_cuda_forward<float>(
-    const float* value, const int32_t* spatialShapes, const int32_t* levelStartIndex,
-    const float* samplingLoc, const float* attnWeight, float* output, int32_t batch,
-    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels,
-    int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream);
+template int32_t ms_deform_attn_cuda_forward<float>(const float* value,
+                                                    const int32_t* spatialShapes,
+                                                    const int32_t* levelStartIndex,
+                                                    const float* samplingLoc,
+                                                    const float* attnWeight,
+                                                    float* output,
+                                                    int32_t batch,
+                                                    int32_t mSpatialSize,
+                                                    int32_t mNumHeads,
+                                                    int32_t mChannels,
+                                                    int32_t mNumLevels,
+                                                    int32_t mNumQuery,
+                                                    int32_t mNumPoint,
+                                                    cudaStream_t stream);

-template int32_t ms_deform_attn_cuda_forward<__half>(
-    const __half* value, const int32_t* spatialShapes, const int32_t* levelStartIndex,
-    const __half* samplingLoc, const __half* attnWeight, __half* output, int32_t batch,
-    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels, int32_t mNumLevels,
-    int32_t mNumQuery, int32_t mNumPoint, cudaStream_t stream);
+template int32_t ms_deform_attn_cuda_forward<__half>(const __half* value,
+                                                     const int32_t* spatialShapes,
+                                                     const int32_t* levelStartIndex,
+                                                     const __half* samplingLoc,
+                                                     const __half* attnWeight,
+                                                     __half* output,
+                                                     int32_t batch,
+                                                     int32_t mSpatialSize,
+                                                     int32_t mNumHeads,
+                                                     int32_t mChannels,
+                                                     int32_t mNumLevels,
+                                                     int32_t mNumQuery,
+                                                     int32_t mNumPoint,
+                                                     cudaStream_t stream);
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh
index cee34cfe65..0bef6ed98c 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.cuh
@@ -4,254 +4,323 @@

 #include "common_cuda_helper.hpp"

-template <typename scalar_t>
-__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t*& bottom_data, const int& height,
-                                                   const int& width, const int& nheads,
-                                                   const int& channels, const scalar_t& h,
-                                                   const scalar_t& w, const int& m, const int& c) {
-  const int h_low = floorf(h);
-  const int w_low = floorf(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
+template<typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t*& bottom_data,
+                                                   const int& height,
+                                                   const int& width,
+                                                   const int& nheads,
+                                                   const int& channels,
+                                                   const scalar_t& h,
+                                                   const scalar_t& w,
+                                                   const int& m,
+                                                   const int& c)
+{
+    const int h_low  = floorf(h);
+    const int w_low  = floorf(w);
+    const int h_high = h_low + 1;
+    const int w_high = w_low + 1;

-  const scalar_t lh = h - h_low;
-  const scalar_t lw = w - w_low;
-  const scalar_t hh = 1 - lh, hw = 1 - lw;
+    const scalar_t lh = h - h_low;
+    const scalar_t lw = w - w_low;
+    const scalar_t hh = 1 - lh, hw = 1 - lw;

-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
+    const int w_stride          = nheads * channels;
+    const int h_stride          = width * w_stride;
+    const int h_low_ptr_offset  = h_low * h_stride;
+    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+    const int w_low_ptr_offset  = w_low * w_stride;
+    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+    const int base_ptr          = m * channels + c;

-  scalar_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-  }
-  scalar_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-  }
-  scalar_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-  }
-  scalar_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-  }
+    scalar_t v1 = 0;
+    if (h_low >= 0 && w_low >= 0)
+    {
+        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+        v1             = bottom_data[ptr1];
+    }
+    scalar_t v2 = 0;
+    if (h_low >= 0 && w_high <= width - 1)
+    {
+        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+        v2             = bottom_data[ptr2];
+    }
+    scalar_t v3 = 0;
+    if (h_high <= height - 1 && w_low >= 0)
+    {
+        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+        v3             = bottom_data[ptr3];
+    }
+    scalar_t v4 = 0;
+    if (h_high <= height - 1 && w_high <= width - 1)
+    {
+        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+        v4             = bottom_data[ptr4];
+    }

-  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+    const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh
* hw, w4 = lh * lw; - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; } -template <> -__device__ __half ms_deform_attn_im2col_bilinear<__half>( - const __half*& bottomData, int32_t const& height, int32_t const& width, int32_t const& nHeads, - int32_t const& channels, const __half& h, const __half& w, int32_t const& m, int32_t const& c) { - int32_t const hLow = __half2int_rd(h); - int32_t const wLow = __half2int_rd(w); - int32_t const hHigh = hLow + 1; - int32_t const wHigh = wLow + 1; +template<> +__device__ __half ms_deform_attn_im2col_bilinear<__half>(const __half*& bottomData, + int32_t const& height, + int32_t const& width, + int32_t const& nHeads, + int32_t const& channels, + const __half& h, + const __half& w, + int32_t const& m, + int32_t const& c) +{ + int32_t const hLow = __half2int_rd(h); + int32_t const wLow = __half2int_rd(w); + int32_t const hHigh = hLow + 1; + int32_t const wHigh = wLow + 1; - const __half kZERO = __int2half_rz(0); - const __half one = __int2half_rz(1); + const __half kZERO = __int2half_rz(0); + const __half one = __int2half_rz(1); #if __CUDA_ARCH__ >= 530 - const __half lh = __hsub(h, __int2half_rd(hLow)); - const __half lw = __hsub(w, __int2half_rd(wLow)); - const __half hh = __hsub(one, lh), hw = __hsub(one, lw); + const __half lh = __hsub(h, __int2half_rd(hLow)); + const __half lw = __hsub(w, __int2half_rd(wLow)); + const __half hh = __hsub(one, lh), hw = __hsub(one, lw); #else - const __half lh = __float2half(__half2float(h) - hLow); - const __half lw = __float2half(__half2float(w) - wLow); - const __half hh = __float2half(__half2float(one) - __half2float(lh)); - const __half hw = __float2half(__half2float(one) - __half2float(lw)); + const __half lh = __float2half(__half2float(h) - hLow); + const __half lw = __float2half(__half2float(w) - wLow); + const __half hh = __float2half(__half2float(one) - __half2float(lh)); + const __half hw = __float2half(__half2float(one) - __half2float(lw)); #endif - int32_t const wStride = nHeads * channels; - int32_t const hStride = width * wStride; - int32_t const hLowPtrOffset = hLow * hStride; - int32_t const hHighPtrOffset = hLowPtrOffset + hStride; - int32_t const wLowPtrOffset = wLow * wStride; - int32_t const wHighPtrOffset = wLowPtrOffset + wStride; - int32_t const basePtr = m * channels + c; + int32_t const wStride = nHeads * channels; + int32_t const hStride = width * wStride; + int32_t const hLowPtrOffset = hLow * hStride; + int32_t const hHighPtrOffset = hLowPtrOffset + hStride; + int32_t const wLowPtrOffset = wLow * wStride; + int32_t const wHighPtrOffset = wLowPtrOffset + wStride; + int32_t const basePtr = m * channels + c; - __half v1 = kZERO; - if (hLow >= 0 && wLow >= 0) { - int32_t const ptr1 = hLowPtrOffset + wLowPtrOffset + basePtr; - v1 = bottomData[ptr1]; - } - __half v2 = kZERO; - if (hLow >= 0 && wHigh <= width - 1) { - int32_t const ptr2 = hLowPtrOffset + wHighPtrOffset + basePtr; - v2 = bottomData[ptr2]; - } - __half v3 = kZERO; - if (hHigh <= height - 1 && wLow >= 0) { - int32_t const ptr3 = hHighPtrOffset + wLowPtrOffset + basePtr; - v3 = bottomData[ptr3]; - } - __half v4 = kZERO; - if (hHigh <= height - 1 && wHigh <= width - 1) { - int32_t const ptr4 = hHighPtrOffset + wHighPtrOffset + basePtr; - v4 = bottomData[ptr4]; - } + __half v1 = kZERO; + if (hLow >= 0 && wLow >= 0) + { + int32_t const ptr1 = hLowPtrOffset + wLowPtrOffset + basePtr; + v1 = bottomData[ptr1]; + } + __half v2 = kZERO; + 
if (hLow >= 0 && wHigh <= width - 1)
+    {
+        int32_t const ptr2 = hLowPtrOffset + wHighPtrOffset + basePtr;
+        v2                 = bottomData[ptr2];
+    }
+    __half v3 = kZERO;
+    if (hHigh <= height - 1 && wLow >= 0)
+    {
+        int32_t const ptr3 = hHighPtrOffset + wLowPtrOffset + basePtr;
+        v3                 = bottomData[ptr3];
+    }
+    __half v4 = kZERO;
+    if (hHigh <= height - 1 && wHigh <= width - 1)
+    {
+        int32_t const ptr4 = hHighPtrOffset + wHighPtrOffset + basePtr;
+        v4                 = bottomData[ptr4];
+    }

 #if __CUDA_ARCH__ >= 530
-  __half w1 = __hmul(__hmul(hh, hw), v1);
-  __half w2 = __hmul(__hmul(hh, lw), v2);
-  __half w3 = __hmul(__hmul(lh, hw), v3);
-  __half w4 = __hmul(__hmul(lh, lw), v4);
+    __half w1 = __hmul(__hmul(hh, hw), v1);
+    __half w2 = __hmul(__hmul(hh, lw), v2);
+    __half w3 = __hmul(__hmul(lh, hw), v3);
+    __half w4 = __hmul(__hmul(lh, lw), v4);

-  w1 = __hadd(w1, w2);
-  w3 = __hadd(w3, w4);
+    w1 = __hadd(w1, w2);
+    w3 = __hadd(w3, w4);

-  const __half val = __hadd(w1, w3);
+    const __half val = __hadd(w1, w3);
 #else
-  __half w1 = __float2half((__half2float(hh) * __half2float(hw)) * __half2float(v1));
-  __half w2 = __float2half((__half2float(hh) * __half2float(lw)) * __half2float(v2));
-  __half w3 = __float2half((__half2float(lh) * __half2float(hw)) * __half2float(v3));
-  __half w4 = __float2half((__half2float(lh) * __half2float(lw)) * __half2float(v4));
+    __half w1 = __float2half((__half2float(hh) * __half2float(hw)) * __half2float(v1));
+    __half w2 = __float2half((__half2float(hh) * __half2float(lw)) * __half2float(v2));
+    __half w3 = __float2half((__half2float(lh) * __half2float(hw)) * __half2float(v3));
+    __half w4 = __float2half((__half2float(lh) * __half2float(lw)) * __half2float(v4));

-  w1 = __float2half(__half2float(w1) + __half2float(w2));
-  w3 = __float2half(__half2float(w3) + __half2float(w4));
+    w1 = __float2half(__half2float(w1) + __half2float(w2));
+    w3 = __float2half(__half2float(w3) + __half2float(w4));

-  const __half val = __float2half(__half2float(w1) + __half2float(w3));
+    const __half val = __float2half(__half2float(w1) + __half2float(w3));
 #endif
-  return val;
+    return val;
 }

 #if 1
-template <typename scalar_t>
-__global__ void ms_deformable_im2col_gpu_kernel(
-    int32_t const n, scalar_t const* dataValue, int32_t const* dataSpatialShapes,
-    int32_t const* dataLevelStartIndex, scalar_t const* dataSamplingLoc,
-    scalar_t const* dataAttnWeight, int32_t const batchSize, int32_t const spatialSize,
-    int32_t const numHeads, int32_t const channels, int32_t const numLevels, int32_t const numQuery,
-    int32_t const numPoint, scalar_t* dataCol) {
-  CUDA_1D_KERNEL_LOOP(index, n) {
-    int32_t _temp = index;
-    int32_t const cCol = _temp % channels;
-    _temp /= channels;
-    int32_t const samplingIndex = _temp;
-    int32_t const mCol = _temp % numHeads;
-    _temp /= numHeads;
-    _temp /= numQuery;
-    int32_t const bCol = _temp;
+template<typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(int32_t const   n,
+                                                scalar_t const* dataValue,
+                                                int32_t const*  dataSpatialShapes,
+                                                int32_t const*  dataLevelStartIndex,
+                                                scalar_t const* dataSamplingLoc,
+                                                scalar_t const* dataAttnWeight,
+                                                int32_t const   batchSize,
+                                                int32_t const   spatialSize,
+                                                int32_t const   numHeads,
+                                                int32_t const   channels,
+                                                int32_t const   numLevels,
+                                                int32_t const   numQuery,
+                                                int32_t const   numPoint,
+                                                scalar_t*       dataCol)
+{
+    CUDA_1D_KERNEL_LOOP(index, n)
+    {
+        int32_t       _temp = index;
+        int32_t const cCol  = _temp % channels;
+        _temp /= channels;
+        int32_t const samplingIndex = _temp;
+        int32_t const mCol          = _temp % numHeads;
+        _temp /= numHeads;
+        _temp /= numQuery;
+        int32_t const bCol = _temp;
-
scalar_t* dataColPtr = dataCol + index; - int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; - int32_t dataLocWPtr = dataWeightPtr << 1; - int32_t const qidStride = numHeads * channels; - int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; - scalar_t col = 0; + scalar_t* dataColPtr = dataCol + index; + int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; + int32_t dataLocWPtr = dataWeightPtr << 1; + int32_t const qidStride = numHeads * channels; + int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; + scalar_t col = 0; - for (int32_t lCol = 0; lCol < numLevels; ++lCol) { - int32_t const levelStartId = dataLevelStartIndex[lCol]; - int32_t const spatialHPtr = lCol << 1; - int32_t const spatialH = dataSpatialShapes[spatialHPtr]; - int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; - scalar_t const* dataValuePtr = - dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); - for (int32_t pCol = 0; pCol < numPoint; ++pCol) { - scalar_t const locW = dataSamplingLoc[dataLocWPtr]; - scalar_t const locH = dataSamplingLoc[dataLocWPtr + 1]; - scalar_t const weight = dataAttnWeight[dataWeightPtr]; + for (int32_t lCol = 0; lCol < numLevels; ++lCol) + { + int32_t const levelStartId = dataLevelStartIndex[lCol]; + int32_t const spatialHPtr = lCol << 1; + int32_t const spatialH = dataSpatialShapes[spatialHPtr]; + int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; + scalar_t const* dataValuePtr = + dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); + for (int32_t pCol = 0; pCol < numPoint; ++pCol) + { + scalar_t const locW = dataSamplingLoc[dataLocWPtr]; + scalar_t const locH = dataSamplingLoc[dataLocWPtr + 1]; + scalar_t const weight = dataAttnWeight[dataWeightPtr]; - scalar_t const hIm = locH * spatialH - 0.5; - scalar_t const wIm = locW * spatialW - 0.5; + scalar_t const hIm = locH * spatialH - 0.5; + scalar_t const wIm = locW * spatialW - 0.5; - if (hIm > -1 && wIm > -1 && hIm < spatialH && wIm < spatialW) { - col += ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, - channels, hIm, wIm, mCol, cCol) * - weight; - } + if (hIm > -1 && wIm > -1 && hIm < spatialH && wIm < spatialW) + { + col += ms_deform_attn_im2col_bilinear(dataValuePtr, + spatialH, + spatialW, + numHeads, + channels, + hIm, + wIm, + mCol, + cCol) * + weight; + } - dataWeightPtr += 1; - dataLocWPtr += 2; - } + dataWeightPtr += 1; + dataLocWPtr += 2; + } + } + *dataColPtr = col; } - *dataColPtr = col; - } } -template <> -__global__ void ms_deformable_im2col_gpu_kernel<__half>( - int32_t const n, const __half* dataValue, int32_t const* dataSpatialShapes, - int32_t const* dataLevelStartIndex, const __half* dataSamplingLoc, const __half* dataAttnWeight, - int32_t const batchSize, int32_t const spatialSize, int32_t const numHeads, - int32_t const channels, int32_t const numLevels, int32_t const numQuery, int32_t const numPoint, - __half* dataCol) { - CUDA_1D_KERNEL_LOOP(index, n) { - int32_t _temp = index; - int32_t const cCol = _temp % channels; - _temp /= channels; - int32_t const samplingIndex = _temp; - int32_t const mCol = _temp % numHeads; - _temp /= numHeads; - _temp /= numQuery; - int32_t const bCol = _temp; +template<> +__global__ void ms_deformable_im2col_gpu_kernel<__half>(int32_t const n, + const __half* dataValue, + int32_t const* dataSpatialShapes, + int32_t const* dataLevelStartIndex, + const __half* dataSamplingLoc, + const __half* dataAttnWeight, + int32_t const batchSize, + int32_t const spatialSize, + 
int32_t const numHeads, + int32_t const channels, + int32_t const numLevels, + int32_t const numQuery, + int32_t const numPoint, + __half* dataCol) +{ + CUDA_1D_KERNEL_LOOP(index, n) + { + int32_t _temp = index; + int32_t const cCol = _temp % channels; + _temp /= channels; + int32_t const samplingIndex = _temp; + int32_t const mCol = _temp % numHeads; + _temp /= numHeads; + _temp /= numQuery; + int32_t const bCol = _temp; - __half* dataColPtr = dataCol + index; - int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; - int32_t dataLocWPtr = dataWeightPtr << 1; - int32_t const qidStride = numHeads * channels; - int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; - const __half kZERO_POINT_FIVE = __float2half(0.5f); - const __half kMINUS_ONE = __float2half(-1.0f); - const __half kZERO = __int2half_rz(0); - __half tpVal = kZERO; - __half col = kZERO; + __half* dataColPtr = dataCol + index; + int32_t dataWeightPtr = samplingIndex * numLevels * numPoint; + int32_t dataLocWPtr = dataWeightPtr << 1; + int32_t const qidStride = numHeads * channels; + int32_t const dataValuePtrInitOffset = bCol * spatialSize * qidStride; + const __half kZERO_POINT_FIVE = __float2half(0.5f); + const __half kMINUS_ONE = __float2half(-1.0f); + const __half kZERO = __int2half_rz(0); + __half tpVal = kZERO; + __half col = kZERO; - for (int32_t lCol = 0; lCol < numLevels; ++lCol) { - int32_t const levelStartId = dataLevelStartIndex[lCol]; - int32_t const spatialHPtr = lCol << 1; - int32_t const spatialH = dataSpatialShapes[spatialHPtr]; - int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; - const __half spatialHHalf = __int2half_rd(spatialH); - const __half spatialWHalf = __int2half_rd(spatialW); - const __half* dataValuePtr = dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); - for (int32_t pCol = 0; pCol < numPoint; ++pCol) { - const __half locW = dataSamplingLoc[dataLocWPtr]; - const __half locH = dataSamplingLoc[dataLocWPtr + 1]; - const __half weight = dataAttnWeight[dataWeightPtr]; -#if __CUDA_ARCH__ >= 530 - const __half hIm = __hsub(__hmul(locH, spatialHHalf), kZERO_POINT_FIVE); - const __half wIm = __hsub(__hmul(locW, spatialWHalf), kZERO_POINT_FIVE); + for (int32_t lCol = 0; lCol < numLevels; ++lCol) + { + int32_t const levelStartId = dataLevelStartIndex[lCol]; + int32_t const spatialHPtr = lCol << 1; + int32_t const spatialH = dataSpatialShapes[spatialHPtr]; + int32_t const spatialW = dataSpatialShapes[spatialHPtr + 1]; + const __half spatialHHalf = __int2half_rd(spatialH); + const __half spatialWHalf = __int2half_rd(spatialW); + const __half* dataValuePtr = dataValue + (dataValuePtrInitOffset + levelStartId * qidStride); + for (int32_t pCol = 0; pCol < numPoint; ++pCol) + { + const __half locW = dataSamplingLoc[dataLocWPtr]; + const __half locH = dataSamplingLoc[dataLocWPtr + 1]; + const __half weight = dataAttnWeight[dataWeightPtr]; + #if __CUDA_ARCH__ >= 530 + const __half hIm = __hsub(__hmul(locH, spatialHHalf), kZERO_POINT_FIVE); + const __half wIm = __hsub(__hmul(locW, spatialWHalf), kZERO_POINT_FIVE); - if (__hgt(hIm, kMINUS_ONE) && __hgt(wIm, kMINUS_ONE) && __hlt(hIm, spatialHHalf) && - __hlt(wIm, spatialWHalf)) { - tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads, - channels, hIm, wIm, mCol, cCol); - col = __hadd(col, __hmul(tpVal, weight)); - } -#else - const __half hIm = __float2half(__half2float(locH) * __half2float(spatialHHalf) - - __half2float(kZERO_POINT_FIVE)); - const __half wIm = 
__float2half(__half2float(locW) * __half2float(spatialWHalf) -
-                                 __half2float(kZERO_POINT_FIVE));
+                if (__hgt(hIm, kMINUS_ONE) && __hgt(wIm, kMINUS_ONE) && __hlt(hIm, spatialHHalf) &&
+                    __hlt(wIm, spatialWHalf))
+                {
+                    tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr,
+                                                           spatialH,
+                                                           spatialW,
+                                                           numHeads,
+                                                           channels,
+                                                           hIm,
+                                                           wIm,
+                                                           mCol,
+                                                           cCol);
+                    col   = __hadd(col, __hmul(tpVal, weight));
+                }
+    #else
+                const __half hIm = __float2half(__half2float(locH) * __half2float(spatialHHalf) -
+                                                __half2float(kZERO_POINT_FIVE));
+                const __half wIm = __float2half(__half2float(locW) * __half2float(spatialWHalf) -
+                                                __half2float(kZERO_POINT_FIVE));

-      if ((__half2float(hIm) > __half2float(kMINUS_ONE)) &&
-          (__half2float(wIm) > __half2float(kMINUS_ONE)) &&
-          (__half2float(hIm) < __half2float(spatialHHalf)) &&
-          (__half2float(wIm) < __half2float(spatialWHalf))) {
-        tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr, spatialH, spatialW, numHeads,
-                                               channels, hIm, wIm, mCol, cCol);
-        col = __float2half(__half2float(col) + (__half2float(tpVal) * __half2float(weight)));
+                if ((__half2float(hIm) > __half2float(kMINUS_ONE)) &&
+                    (__half2float(wIm) > __half2float(kMINUS_ONE)) &&
+                    (__half2float(hIm) < __half2float(spatialHHalf)) &&
+                    (__half2float(wIm) < __half2float(spatialWHalf)))
+                {
+                    tpVal = ms_deform_attn_im2col_bilinear(dataValuePtr,
+                                                           spatialH,
+                                                           spatialW,
+                                                           numHeads,
+                                                           channels,
+                                                           hIm,
+                                                           wIm,
+                                                           mCol,
+                                                           cCol);
+                    col = __float2half(__half2float(col) + (__half2float(tpVal) * __half2float(weight)));
+                }
+    #endif
+                dataWeightPtr += 1;
+                dataLocWPtr += 2;
+            }
       }
-#endif
-      dataWeightPtr += 1;
-      dataLocWPtr += 2;
-    }
+        *dataColPtr = col;
   }
-    *dataColPtr = col;
-  }
 }
 #endif
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp
index adbe2566fd..b052c8ce7c 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/multi_scale_deform_attn/trt_ms_deform_attn_kernel.hpp
@@ -4,12 +4,20 @@
 #include
 #include

-template <typename scalar_t>
-int32_t ms_deform_attn_cuda_forward(const scalar_t* value, const int32_t* spatialShapes,
-                                    const int32_t* levelStartIndex, const scalar_t* samplingLoc,
-                                    const scalar_t* attnWeight, scalar_t* output, int32_t batch,
-                                    int32_t mSpatialSize, int32_t mNumHeads, int32_t mChannels,
-                                    int32_t mNumLevels, int32_t mNumQuery, int32_t mNumPoint,
-                                    cudaStream_t stream);
+template<typename scalar_t>
+int32_t ms_deform_attn_cuda_forward(const scalar_t* value,
+                                    const int32_t* spatialShapes,
+                                    const int32_t* levelStartIndex,
+                                    const scalar_t* samplingLoc,
+                                    const scalar_t* attnWeight,
+                                    scalar_t* output,
+                                    int32_t batch,
+                                    int32_t mSpatialSize,
+                                    int32_t mNumHeads,
+                                    int32_t mChannels,
+                                    int32_t mNumLevels,
+                                    int32_t mNumQuery,
+                                    int32_t mNumPoint,
+                                    cudaStream_t stream);

 #endif
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
index 988893125d..0d71885676 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.cpp
@@ -9,233 +9,315 @@
 #include "trt_roi_align_kernel.hpp"
 #include "trt_serialize.hpp"

-namespace mmdeploy {
-namespace {
-static const char *PLUGIN_VERSION{"1"};
-static const char *PLUGIN_NAME{"MMCVRoiAlign"};
-}  // namespace
-
-TRTRoIAlign::TRTRoIAlign(const std::string &name, int outWidth, int outHeight, float spatialScale,
-                         int
sampleRatio, int poolMode, bool aligned) - : TRTPluginBase(name), - mOutWidth(outWidth), - mOutHeight(outHeight), - mSpatialScale(spatialScale), - mSampleRatio(sampleRatio), - mPoolMode(poolMode), - mAligned(aligned) {} - -TRTRoIAlign::TRTRoIAlign(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) { - deserialize_value(&data, &length, &mOutWidth); - deserialize_value(&data, &length, &mOutHeight); - deserialize_value(&data, &length, &mSpatialScale); - deserialize_value(&data, &length, &mSampleRatio); - deserialize_value(&data, &length, &mPoolMode); - deserialize_value(&data, &length, &mAligned); -} - -nvinfer1::IPluginV2DynamicExt *TRTRoIAlign::clone() const TRT_NOEXCEPT { - TRTRoIAlign *plugin = new TRTRoIAlign(mLayerName, mOutWidth, mOutHeight, mSpatialScale, - mSampleRatio, mPoolMode, mAligned); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - nvinfer1::DimsExprs ret; - ret.nbDims = 4; - ret.d[0] = inputs[1].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutHeight); - ret.d[3] = exprBuilder.constant(mOutWidth); - - return ret; -} - -bool TRTRoIAlign::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - return ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; -} - -void TRTRoIAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - size_t output_size = 0; - size_t word_size = 0; - switch (mPoolMode) { - case 0: // max - output_size = - outputs[0].dims.d[0] * outputs[0].dims.d[1] * outputs[0].dims.d[2] * outputs[0].dims.d[3]; - word_size = mmdeploy::getElementSize(outputs[0].type); - return output_size * word_size * 2; - break; - case 1: - return 0; - break; - default: - return 0; - } - return 0; -} - -int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - int channels = inputDesc[0].dims.d[1]; - int height = inputDesc[0].dims.d[2]; - int width = inputDesc[0].dims.d[3]; - - int output_size = outputDesc[0].dims.d[0] * outputDesc[0].dims.d[1] * outputDesc[0].dims.d[2] * - outputDesc[0].dims.d[3]; - int word_size = mmdeploy::getElementSize(outputDesc[0].type); - - const void *feat = inputs[0]; - const void *rois = inputs[1]; - void *output = outputs[0]; - void *argmax_y = nullptr; - void *argmax_x = nullptr; - - switch (mPoolMode) { - case 0: // max - argmax_y = workSpace; - argmax_x = (char *)argmax_y + output_size * word_size; - break; - case 1: // avg - break; - } - - switch (outputDesc[0].type) { - case nvinfer1::DataType::kFLOAT: - TRTRoIAlignForwardCUDAKernelLauncher( - (const float *)feat, (const float *)rois, (float *)output, (float *)argmax_y, - (float *)argmax_x, output_size, channels, height, width, mOutHeight, mOutWidth, - mSpatialScale, mSampleRatio, mPoolMode, mAligned, stream); - break; - - default: - break; - } - - return 0; -} - 
-nvinfer1::DataType TRTRoIAlign::getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
-                                                  int nbInputs) const TRT_NOEXCEPT {
-  return inputTypes[0];
-}
-
-// IPluginV2 Methods
-const char *TRTRoIAlign::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; }
-
-const char *TRTRoIAlign::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
-
-int TRTRoIAlign::getNbOutputs() const TRT_NOEXCEPT { return 1; }
-
-size_t TRTRoIAlign::getSerializationSize() const TRT_NOEXCEPT {
-  return serialized_size(mOutWidth) + serialized_size(mOutHeight) + serialized_size(mSpatialScale) +
-         serialized_size(mSampleRatio) + serialized_size(mPoolMode) + serialized_size(mAligned);
-}
-
-void TRTRoIAlign::serialize(void *buffer) const TRT_NOEXCEPT {
-  serialize_value(&buffer, mOutWidth);
-  serialize_value(&buffer, mOutHeight);
-  serialize_value(&buffer, mSpatialScale);
-  serialize_value(&buffer, mSampleRatio);
-  serialize_value(&buffer, mPoolMode);
-  serialize_value(&buffer, mAligned);
-}
-
-TRTRoIAlignCreator::TRTRoIAlignCreator() {
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("sampling_ratio"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("mode"));
-  mPluginAttributes.emplace_back(nvinfer1::PluginField("aligned"));
-  mFC.nbFields = mPluginAttributes.size();
-  mFC.fields = mPluginAttributes.data();
-}
-
-const char *TRTRoIAlignCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; }
-
-const char *TRTRoIAlignCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; }
-
-nvinfer1::IPluginV2 *TRTRoIAlignCreator::createPlugin(
-    const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT {
-  int outWidth = 7;
-  int outHeight = 7;
-  float spatialScale = 1.0;
-  int sampleRatio = 0;
-  int poolMode = -1;
-  bool aligned = true;
-  for (int i = 0; i < fc->nbFields; i++) {
-    if (fc->fields[i].data == nullptr) {
-      continue;
-    }
-    std::string field_name(fc->fields[i].name);
-
-    if (field_name.compare("output_height") == 0) {
-      outHeight = static_cast<const int *>(fc->fields[i].data)[0];
-    }
-
-    if (field_name.compare("output_width") == 0) {
-      outWidth = static_cast<const int *>(fc->fields[i].data)[0];
-    }
-
-    if (field_name.compare("spatial_scale") == 0) {
-      spatialScale = static_cast<const float *>(fc->fields[i].data)[0];
-    }
-
-    if (field_name.compare("sampling_ratio") == 0) {
-      sampleRatio = static_cast<const int *>(fc->fields[i].data)[0];
-    }
-
-    if (field_name.compare("mode") == 0) {
-      int data_size = fc->fields[i].length;
-      ASSERT(data_size > 0);
-      const char *data_start = static_cast<const char *>(fc->fields[i].data);
-      std::string pool_mode(data_start);
-      if (pool_mode == "avg") {
-        poolMode = 1;
-      } else if (pool_mode == "max") {
-        poolMode = 0;
-      } else {
-        std::cout << "Unknown pool mode \"" << pool_mode << "\"." << std::endl;
-      }
-      ASSERT(poolMode >= 0);
-    }
-
-    if (field_name.compare("aligned") == 0) {
-      int aligned_int = static_cast<const int *>(fc->fields[i].data)[0];
-      aligned = aligned_int != 0;
-    }
-  }
-
-  ASSERT(outHeight > 0);
-  ASSERT(outWidth > 0);
-  ASSERT(spatialScale > 0.);
-  ASSERT(poolMode >= 0);
-
-  TRTRoIAlign *plugin =
-      new TRTRoIAlign(name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-
-nvinfer1::IPluginV2 *TRTRoIAlignCreator::deserializePlugin(const char *name, const void *serialData,
-                                                           size_t serialLength) TRT_NOEXCEPT {
-  auto plugin = new TRTRoIAlign(name, serialData, serialLength);
-  plugin->setPluginNamespace(getPluginNamespace());
-  return plugin;
-}
-REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator);
+namespace mmdeploy
+{
+    namespace
+    {
+        static const char* PLUGIN_VERSION{"1"};
+        static const char* PLUGIN_NAME{"MMCVRoiAlign"};
+    }  // namespace
+
+    TRTRoIAlign::TRTRoIAlign(const std::string& name,
+                             int outWidth,
+                             int outHeight,
+                             float spatialScale,
+                             int sampleRatio,
+                             int poolMode,
+                             bool aligned)
+        : TRTPluginBase(name)
+        , mOutWidth(outWidth)
+        , mOutHeight(outHeight)
+        , mSpatialScale(spatialScale)
+        , mSampleRatio(sampleRatio)
+        , mPoolMode(poolMode)
+        , mAligned(aligned)
+    {
+    }
+
+    TRTRoIAlign::TRTRoIAlign(const std::string name, const void* data, size_t length)
+        : TRTPluginBase(name)
+    {
+        deserialize_value(&data, &length, &mOutWidth);
+        deserialize_value(&data, &length, &mOutHeight);
+        deserialize_value(&data, &length, &mSpatialScale);
+        deserialize_value(&data, &length, &mSampleRatio);
+        deserialize_value(&data, &length, &mPoolMode);
+        deserialize_value(&data, &length, &mAligned);
+    }
+
+    nvinfer1::IPluginV2DynamicExt* TRTRoIAlign::clone() const TRT_NOEXCEPT
+    {
+        TRTRoIAlign* plugin = new TRTRoIAlign(mLayerName,
+                                              mOutWidth,
+                                              mOutHeight,
+                                              mSpatialScale,
+                                              mSampleRatio,
+                                              mPoolMode,
+                                              mAligned);
+        plugin->setPluginNamespace(getPluginNamespace());
+
+        return plugin;
+    }
+
+    nvinfer1::DimsExprs TRTRoIAlign::getOutputDimensions(int outputIndex,
+                                                         const nvinfer1::DimsExprs* inputs,
+                                                         int nbInputs,
+                                                         nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT
+    {
+        nvinfer1::DimsExprs ret;
+        ret.nbDims = 4;
+        ret.d[0]   = inputs[1].d[0];
+        ret.d[1]   = inputs[0].d[1];
+        ret.d[2]   = exprBuilder.constant(mOutHeight);
+        ret.d[3]   = exprBuilder.constant(mOutWidth);
+
+        return ret;
+    }
+
+    bool TRTRoIAlign::supportsFormatCombination(int pos,
+                                                const nvinfer1::PluginTensorDesc* ioDesc,
+                                                int nbInputs,
+                                                int nbOutputs) TRT_NOEXCEPT
+    {
+        return ioDesc[pos].type == nvinfer1::DataType::kFLOAT &&
+               ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR;
+    }
+
+    void TRTRoIAlign::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs,
+                                      int nbInputs,
+                                      const nvinfer1::DynamicPluginTensorDesc* outputs,
+                                      int nbOutputs) TRT_NOEXCEPT {}
+
+    size_t TRTRoIAlign::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                                         int nbInputs,
+                                         const nvinfer1::PluginTensorDesc* outputs,
+                                         int nbOutputs) const TRT_NOEXCEPT
+    {
+        size_t output_size = 0;
+        size_t word_size   = 0;
+        switch (mPoolMode)
+        {
+            case 0:  // max
+                output_size = outputs[0].dims.d[0] * outputs[0].dims.d[1] * outputs[0].dims.d[2] * outputs[0].dims.d[3];
+                word_size   = mmdeploy::getElementSize(outputs[0].type);
+                return output_size * word_size * 2;
+                break;
+            case 1:
+                return 0;
+                break;
+            default:
+                return 0;
+        }
+        return 0;
+    }
+
+    int TRTRoIAlign::enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+                             const nvinfer1::PluginTensorDesc* outputDesc,
+                             const void* const* inputs,
+                             void* const* outputs,
+                             void* workSpace,
+                             cudaStream_t stream) TRT_NOEXCEPT
+    {
+        int channels = inputDesc[0].dims.d[1];
+        int height   = inputDesc[0].dims.d[2];
+        int width    = inputDesc[0].dims.d[3];
+
+        int output_size = outputDesc[0].dims.d[0] *
+                          outputDesc[0].dims.d[1] *
+                          outputDesc[0].dims.d[2] *
+                          outputDesc[0].dims.d[3];
+        int word_size = mmdeploy::getElementSize(outputDesc[0].type);
+
+        const void* feat     = inputs[0];
+        const void* rois     = inputs[1];
+        void*       output   = outputs[0];
+        void*       argmax_y = nullptr;
+        void*       argmax_x = nullptr;
+
+        switch (mPoolMode)
+        {
+            case 0:  // max
+                argmax_y = workSpace;
+                argmax_x = (char*)argmax_y + output_size * word_size;
+                break;
+            case 1:  // avg
+                break;
+        }
+
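+        // Workspace layout (max mode): two buffers of output_size elements each,
+        // argmax_y followed by argmax_x, matching the 2 * output_size * word_size
+        // bytes reported by getWorkspaceSize() above.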
+        switch (outputDesc[0].type)
+        {
+            case nvinfer1::DataType::kFLOAT:
+                TRTRoIAlignForwardCUDAKernelLauncher<float>(
+                    (const float*)feat,
+                    (const float*)rois,
+                    (float*)output,
+                    (float*)argmax_y,
+                    (float*)argmax_x,
+                    output_size,
+                    channels,
+                    height,
+                    width,
+                    mOutHeight,
+                    mOutWidth,
+                    mSpatialScale,
+                    mSampleRatio,
+                    mPoolMode,
+                    mAligned,
+                    stream);
+                break;
+
+            default:
+                break;
+        }
+
+        return 0;
+    }
+
+    nvinfer1::DataType TRTRoIAlign::getOutputDataType(int index,
+                                                      const nvinfer1::DataType* inputTypes,
+                                                      int nbInputs) const TRT_NOEXCEPT
+    {
+        return inputTypes[0];
+    }
+
+    // IPluginV2 Methods
+    const char* TRTRoIAlign::getPluginType() const TRT_NOEXCEPT
+    {
+        return PLUGIN_NAME;
+    }
+
+    const char* TRTRoIAlign::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return PLUGIN_VERSION;
+    }
+
+    int TRTRoIAlign::getNbOutputs() const TRT_NOEXCEPT
+    {
+        return 1;
+    }
+
+    size_t TRTRoIAlign::getSerializationSize() const TRT_NOEXCEPT
+    {
+        return serialized_size(mOutWidth) + serialized_size(mOutHeight) + serialized_size(mSpatialScale) +
+               serialized_size(mSampleRatio) + serialized_size(mPoolMode) + serialized_size(mAligned);
+    }
+
+    void TRTRoIAlign::serialize(void* buffer) const TRT_NOEXCEPT
+    {
+        serialize_value(&buffer, mOutWidth);
+        serialize_value(&buffer, mOutHeight);
+        serialize_value(&buffer, mSpatialScale);
+        serialize_value(&buffer, mSampleRatio);
+        serialize_value(&buffer, mPoolMode);
+        serialize_value(&buffer, mAligned);
+    }
+
+    TRTRoIAlignCreator::TRTRoIAlignCreator()
+    {
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("output_height"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("output_width"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("spatial_scale"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("sampling_ratio"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("mode"));
+        mPluginAttributes.emplace_back(nvinfer1::PluginField("aligned"));
+        mFC.nbFields = mPluginAttributes.size();
+        mFC.fields   = mPluginAttributes.data();
+    }
+
+    const char* TRTRoIAlignCreator::getPluginName() const TRT_NOEXCEPT
+    {
+        return PLUGIN_NAME;
+    }
+
+    const char* TRTRoIAlignCreator::getPluginVersion() const TRT_NOEXCEPT
+    {
+        return PLUGIN_VERSION;
+    }
+
+    nvinfer1::IPluginV2* TRTRoIAlignCreator::createPlugin(
+        const char* name,
+        const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT
+    {
+        int   outWidth     = 7;
+        int   outHeight    = 7;
+        float spatialScale = 1.0;
+        int   sampleRatio  = 0;
+        int   poolMode     = -1;
+        bool  aligned      = true;
+        for (int i = 0; i < fc->nbFields; i++)
+        {
+            if (fc->fields[i].data == nullptr)
+            {
+                continue;
+            }
+            std::string field_name(fc->fields[i].name);
+
+            if (field_name.compare("output_height") == 0)
+            {
+                outHeight = static_cast<const int*>(fc->fields[i].data)[0];
+            }
+
+            if (field_name.compare("output_width") == 0)
+            {
+                outWidth = static_cast<const int*>(fc->fields[i].data)[0];
+            }
+
+            if (field_name.compare("spatial_scale") == 0)
+            {
+                spatialScale = static_cast<const float*>(fc->fields[i].data)[0];
+            }
+
+            if (field_name.compare("sampling_ratio") == 0)
+            {
+                sampleRatio = static_cast<const int*>(fc->fields[i].data)[0];
+            }
+
+            if (field_name.compare("mode") == 0)
+            {
+                int data_size = fc->fields[i].length;
+                ASSERT(data_size > 0);
+                const char* data_start = static_cast<const char*>(fc->fields[i].data);
+                std::string pool_mode(data_start);
+                if (pool_mode == "avg")
+                {
+                    poolMode = 1;
+                }
+                else if (pool_mode == "max")
+                {
+                    poolMode = 0;
+                }
+                else
+                {
+                    std::cout << "Unknown pool mode \"" << pool_mode << "\"." << std::endl;
+                }
+                ASSERT(poolMode >= 0);
+            }
+
+            if (field_name.compare("aligned") == 0)
+            {
+                int aligned_int = static_cast<const int*>(fc->fields[i].data)[0];
+                aligned         = aligned_int != 0;
+            }
+        }
+
+        ASSERT(outHeight > 0);
+        ASSERT(outWidth > 0);
+        ASSERT(spatialScale > 0.);
+        ASSERT(poolMode >= 0);
+
+        TRTRoIAlign* plugin =
+            new TRTRoIAlign(name, outWidth, outHeight, spatialScale, sampleRatio, poolMode, aligned);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+
+    nvinfer1::IPluginV2* TRTRoIAlignCreator::deserializePlugin(const char* name,
+                                                               const void* serialData,
+                                                               size_t serialLength) TRT_NOEXCEPT
+    {
+        auto plugin = new TRTRoIAlign(name, serialData, serialLength);
+        plugin->setPluginNamespace(getPluginNamespace());
+        return plugin;
+    }
+    REGISTER_TENSORRT_PLUGIN(TRTRoIAlignCreator);
 }  // namespace mmdeploy
diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
index cfc14758f7..605c1a4333 100644
--- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
+++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align.hpp
@@ -8,65 +8,91 @@
 #include

 #include "trt_plugin_base.hpp"

-namespace mmdeploy {
-class TRTRoIAlign : public TRTPluginBase {
- public:
-  TRTRoIAlign(const std::string &name, int outWidth, int outHeight, float spatialScale,
-              int sampleRatio, int poolMode, bool aligned);
-
-  TRTRoIAlign(const std::string name, const void *data, size_t length);
-
-  TRTRoIAlign() = delete;
-
-  // IPluginV2DynamicExt Methods
-  nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override;
-  nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs,
-                                          int nbInputs, nvinfer1::IExprBuilder &exprBuilder)
-      TRT_NOEXCEPT override;
-  bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs,
-                                 int nbOutputs) TRT_NOEXCEPT override;
-  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs,
-                       const nvinfer1::DynamicPluginTensorDesc *out,
-                       int nbOutputs) TRT_NOEXCEPT override;
-  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs,
-                          const nvinfer1::PluginTensorDesc *outputs,
-                          int nbOutputs) const TRT_NOEXCEPT override;
-  int enqueue(const nvinfer1::PluginTensorDesc *inputDesc,
-              const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs,
-              void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override;
-
-  // IPluginV2Ext Methods
-  nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes,
-                                       int nbInputs) const TRT_NOEXCEPT override;
-
-  // IPluginV2 Methods
-  const char *getPluginType() const TRT_NOEXCEPT override;
-  const char *getPluginVersion() const TRT_NOEXCEPT override;
-  int getNbOutputs() const TRT_NOEXCEPT override;
-  size_t
getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - - private: - int mOutWidth; - int mOutHeight; - float mSpatialScale; - int mSampleRatio; - int mPoolMode; // 1:avg 0:max - bool mAligned; -}; - -class TRTRoIAlignCreator : public TRTPluginCreatorBase { - public: - TRTRoIAlignCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class TRTRoIAlign : public TRTPluginBase + { + public: + TRTRoIAlign(const std::string& name, + int outWidth, + int outHeight, + float spatialScale, + int sampleRatio, + int poolMode, + bool aligned); + + TRTRoIAlign(const std::string name, + const void* data, + size_t length); + + TRTRoIAlign() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + private: + int mOutWidth; + int mOutHeight; + float mSpatialScale; + int mSampleRatio; + int mPoolMode; // 1:avg 0:max + bool mAligned; + }; + + class TRTRoIAlignCreator : public TRTPluginCreatorBase + { + public: + TRTRoIAlignCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_ROI_ALIGN_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu index 4e1a825d4f..a8ba93b5ad 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu +++ 
b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.cu
@@ -4,104 +4,157 @@
 #include "trt_roi_align_kernel.hpp"

 /*** Forward ***/
-template <typename T>
-__global__ void roi_align_forward_cuda_kernel(const int nthreads, const T* input, const T* rois,
-                                              T* output, T* argmax_y, T* argmax_x,
-                                              const int pooled_height, const int pooled_width,
-                                              const T spatial_scale, const int sampling_ratio,
-                                              const int pool_mode,  // 0 - max pool, 1 - avg pool
-                                              const bool aligned, const int channels,
-                                              const int height, const int width) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+template<typename T>
+__global__ void roi_align_forward_cuda_kernel(const int nthreads,
+                                              const T* input,
+                                              const T* rois,
+                                              T* output,
+                                              T* argmax_y,
+                                              T* argmax_x,
+                                              const int pooled_height,
+                                              const int pooled_width,
+                                              const T spatial_scale,
+                                              const int sampling_ratio,
+                                              const int pool_mode,  // 0 - max pool, 1 - avg pool
+                                              const bool aligned,
+                                              const int channels,
+                                              const int height,
+                                              const int width)
+{
+    CUDA_1D_KERNEL_LOOP(index, nthreads)
+    {
+        // (n, c, ph, pw) is an element in the pooled output
+        int pw = index % pooled_width;
+        int ph = (index / pooled_width) % pooled_height;
+        int c  = (index / pooled_width / pooled_height) % channels;
+        int n  = index / pooled_width / pooled_height / channels;

-    const T* offset_rois = rois + n * 5;
-    int roi_batch_ind = offset_rois[0];
+        const T* offset_rois   = rois + n * 5;
+        int      roi_batch_ind = offset_rois[0];

-    // Do not using rounding; this implementation detail is critical
-    T offset = aligned ? (T)0.5 : (T)0.0;
-    T roi_start_w = offset_rois[1] * spatial_scale - offset;
-    T roi_start_h = offset_rois[2] * spatial_scale - offset;
-    T roi_end_w = offset_rois[3] * spatial_scale - offset;
-    T roi_end_h = offset_rois[4] * spatial_scale - offset;
+        // Do not use rounding; this implementation detail is critical
+        T offset      = aligned ? (T)0.5 : (T)0.0;
+        T roi_start_w = offset_rois[1] * spatial_scale - offset;
+        T roi_start_h = offset_rois[2] * spatial_scale - offset;
+        T roi_end_w   = offset_rois[3] * spatial_scale - offset;
+        T roi_end_h   = offset_rois[4] * spatial_scale - offset;

-    T roi_width = roi_end_w - roi_start_w;
-    T roi_height = roi_end_h - roi_start_h;
-    if (!aligned) {  // for backward-compatibility only
-      roi_width = max(roi_width, (T)1.);
-      roi_height = max(roi_height, (T)1.);
-    }
+        T roi_width  = roi_end_w - roi_start_w;
+        T roi_height = roi_end_h - roi_start_h;
+        if (!aligned)
+        {  // for backward-compatibility only
+            roi_width  = max(roi_width, (T)1.);
+            roi_height = max(roi_height, (T)1.);
+        }

-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+        T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+        T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

-    const T* offset_input = input + (roi_batch_ind * channels + c) * height * width;
+        const T* offset_input = input + (roi_batch_ind * channels + c) * height * width;
+
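+        // offset_input points at the start of the (roi_batch_ind, c) plane of the
+        // NCHW input, so bilinear_interpolate() below works in (y, x) coordinates
+        // of a single height x width feature map.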
sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); - if (pool_mode == 0) { - // We do max pooling inside a bin - T maxval = -FLT_MAX; - T maxidx_y = -1.f, maxidx_x = -1.f; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - T val = bilinear_interpolate(offset_input, height, width, y, x); - if (val > maxval) { - maxval = val; - maxidx_y = y; - maxidx_x = x; - } + if (pool_mode == 0) + { + // We do max pooling inside a bin + T maxval = -FLT_MAX; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x); + if (val > maxval) + { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + } + } + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; } - } - output[index] = maxval; - argmax_y[index] = maxidx_y; - argmax_x[index] = maxidx_x; - } else if (pool_mode == 1) { - // We do average pooling inside a bin - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0.; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_start_h + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_start_w + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); - T val = bilinear_interpolate(offset_input, height, width, y, x); - output_val += val; + else if (pool_mode == 1) + { + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) + { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) + { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x); + output_val += val; + } + } + output[index] = output_val / count; } - } - output[index] = output_val / count; } - } } -template -void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, - scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, - int output_size, int channels, int height, int width, - int aligned_height, int aligned_width, - scalar_t spatial_scale, int sampling_ratio, int pool_mode, - bool aligned, cudaStream_t stream) { - roi_align_forward_cuda_kernel - <<>>( - output_size, input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, - static_cast(spatial_scale), sampling_ratio, pool_mode, aligned, channels, - height, width); +template +void 
TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, + const scalar_t* rois, + scalar_t* output, + scalar_t* argmax_y, + scalar_t* argmax_x, + int output_size, + int channels, + int height, + int width, + int aligned_height, + int aligned_width, + scalar_t spatial_scale, + int sampling_ratio, + int pool_mode, + bool aligned, + cudaStream_t stream) +{ + roi_align_forward_cuda_kernel + <<>>(output_size, + input, + rois, + output, + argmax_y, + argmax_x, + aligned_height, + aligned_width, + static_cast(spatial_scale), + sampling_ratio, + pool_mode, + aligned, + channels, + height, + width); } -template void TRTRoIAlignForwardCUDAKernelLauncher( - const float* input, const float* rois, float* output, float* argmax_y, float* argmax_x, - int output_size, int channels, int height, int width, int aligned_height, int aligned_width, - float spatial_scale, int sampling_ratio, int pool_mode, bool aligned, cudaStream_t stream); +template void TRTRoIAlignForwardCUDAKernelLauncher(const float* input, + const float* rois, + float* output, + float* argmax_y, + float* argmax_x, + int output_size, + int channels, + int height, + int width, + int aligned_height, + int aligned_width, + float spatial_scale, + int sampling_ratio, + int pool_mode, + bool aligned, + cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp index 3db656bff9..38906636a4 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/roi_align/trt_roi_align_kernel.hpp @@ -4,12 +4,22 @@ #include "common_cuda_helper.hpp" -template -void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, const scalar_t* rois, - scalar_t* output, scalar_t* argmax_y, scalar_t* argmax_x, - int output_size, int channels, int height, int width, - int aligned_height, int aligned_width, - scalar_t spatial_scale, int sampling_ratio, int pool_mode, - bool aligned, cudaStream_t stream); +template +void TRTRoIAlignForwardCUDAKernelLauncher(const scalar_t* input, + const scalar_t* rois, + scalar_t* output, + scalar_t* argmax_y, + scalar_t* argmax_x, + int output_size, + int channels, + int height, + int width, + int aligned_height, + int aligned_width, + scalar_t spatial_scale, + int sampling_ratio, + int pool_mode, + bool aligned, + cudaStream_t stream); #endif // ROI_ALIGN_CUDA_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp index a4ecb2356a..551c6ce996 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.cpp @@ -10,174 +10,242 @@ using namespace nvinfer1; -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"ScaledDotProductAttentionTRT"}; -} // namespace - -ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string &name) - : TRTPluginBase(name), mask_dim(0) {} - -ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string name, const void *data, - size_t length) - : TRTPluginBase(name), mask_dim(0) {} - -ScaledDotProductAttentionTRT::~ScaledDotProductAttentionTRT() {} - -nvinfer1::IPluginV2DynamicExt *ScaledDotProductAttentionTRT::clone() const TRT_NOEXCEPT { 
- ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(mLayerName); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::DimsExprs ScaledDotProductAttentionTRT::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - if (outputIndex == 0) return inputs[0]; - nvinfer1::DimsExprs ret; - ret.nbDims = 3; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = inputs[1].d[1]; - - return ret; -} - -bool ScaledDotProductAttentionTRT::supportsFormatCombination( - int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos == 0) { - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - } else { - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - } -} - -// Attach the plugin object to an execution context and grant the plugin the -// access to some context resource. -void ScaledDotProductAttentionTRT::attachToContext(cudnnContext *cudnnContext, - cublasContext *cublasContext, - IGpuAllocator *gpuAllocator) TRT_NOEXCEPT { - _cublas_handle = cublasContext; - _cudnn_handle = cudnnContext; - cudnnCreateTensorDescriptor(&_x_desc); - cudnnCreateTensorDescriptor(&_y_desc); - cudnnCreateTensorDescriptor(&_mask_desc); -} - -// Detach the plugin object from its execution context. -void ScaledDotProductAttentionTRT::detachFromContext() TRT_NOEXCEPT { - cudnnDestroyTensorDescriptor(_y_desc); - cudnnDestroyTensorDescriptor(_x_desc); - cudnnDestroyTensorDescriptor(_mask_desc); -} - -void ScaledDotProductAttentionTRT::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, - int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT { - if (nbInputs != 4) { - mask_dim = 0; - } else { - mask_dim = in[3].desc.dims.nbDims; - } -} - -int ScaledDotProductAttentionTRT::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, - const void *const *inputs, void *const *outputs, - void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - if (CUDNN_STATUS_SUCCESS != cudnnSetStream(_cudnn_handle, stream)) return 1; - if (CUBLAS_STATUS_SUCCESS != cublasSetStream(_cublas_handle, stream)) return 1; - int B = inputDesc[0].dims.d[0]; // batch * heads - int Nt = inputDesc[0].dims.d[1]; - int Ns = inputDesc[1].dims.d[1]; - int E = inputDesc[0].dims.d[2]; // embeding size - - const void *query = inputs[0]; - const void *key = inputs[1]; - const void *value = inputs[2]; - const void *mask = nullptr; - - int mask_dims[3]; - mask_dims[0] = 0; - if (mask_dim > 0) { - mask = inputs[3]; - // check if mask need broadcast - if (mask_dim == 2) { - mask_dims[0] = 1; - mask_dims[1] = inputDesc[3].dims.d[0]; - mask_dims[2] = inputDesc[3].dims.d[1]; - } else { - mask_dims[0] = inputDesc[3].dims.d[0]; - mask_dims[1] = inputDesc[3].dims.d[1]; - mask_dims[2] = inputDesc[3].dims.d[2]; - } - } - - void *output = outputs[0]; - void *attn = outputs[1]; - - auto data_type = inputDesc[0].type; - cudnnDataType_t cudnn_dtype{}; - convert_trt2cudnn_dtype(data_type, &cudnn_dtype); - switch (data_type) { - case nvinfer1::DataType::kFLOAT: - dot_product_attention_impl((float *)query, (float *)key, (float *)value, (float *)mask, - (float *)attn, (float *)output, B, Nt, Ns, E, &mask_dims[0], - _x_desc, _y_desc, _mask_desc, cudnn_dtype, stream, - _cublas_handle, _cudnn_handle); - break; - 
default: - return 1; - } - - return 0; -} - -nvinfer1::DataType ScaledDotProductAttentionTRT::getOutputDataType( - int index, const nvinfer1::DataType *inputTypes, int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *ScaledDotProductAttentionTRT::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *ScaledDotProductAttentionTRT::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -int ScaledDotProductAttentionTRT::getNbOutputs() const TRT_NOEXCEPT { return 2; } - -size_t ScaledDotProductAttentionTRT::getSerializationSize() const TRT_NOEXCEPT { return 0; } - -void ScaledDotProductAttentionTRT::serialize(void *buffer) const TRT_NOEXCEPT {} - -////////////////////// creator ///////////////////////////// - -ScaledDotProductAttentionTRTCreator::ScaledDotProductAttentionTRTCreator() {} - -const char *ScaledDotProductAttentionTRTCreator::getPluginName() const TRT_NOEXCEPT { - return PLUGIN_NAME; -} - -const char *ScaledDotProductAttentionTRTCreator::getPluginVersion() const TRT_NOEXCEPT { - return PLUGIN_VERSION; -} - -nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - ScaledDotProductAttentionTRT *plugin = new ScaledDotProductAttentionTRT(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *ScaledDotProductAttentionTRTCreator::deserializePlugin( - const char *name, const void *serialData, size_t serialLength) TRT_NOEXCEPT { - auto plugin = new ScaledDotProductAttentionTRT(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} -REGISTER_TENSORRT_PLUGIN(ScaledDotProductAttentionTRTCreator); +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"ScaledDotProductAttentionTRT"}; + } // namespace + + ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string& name) + : TRTPluginBase(name) + , mask_dim(0) + { + } + + ScaledDotProductAttentionTRT::ScaledDotProductAttentionTRT(const std::string name, + const void* data, + size_t length) + : TRTPluginBase(name) + , mask_dim(0) + { + } + + ScaledDotProductAttentionTRT::~ScaledDotProductAttentionTRT() {} + + nvinfer1::IPluginV2DynamicExt* ScaledDotProductAttentionTRT::clone() const TRT_NOEXCEPT + { + ScaledDotProductAttentionTRT* plugin = new ScaledDotProductAttentionTRT(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::DimsExprs ScaledDotProductAttentionTRT::getOutputDimensions( + int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + if (outputIndex == 0) return inputs[0]; + nvinfer1::DimsExprs ret; + ret.nbDims = 3; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = inputs[1].d[1]; + + return ret; + } + + bool ScaledDotProductAttentionTRT::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos == 0) + { + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + } + else + { + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + } + } + + // Attach the plugin object to an execution context and grant the plugin the + // access to some context resource. 
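For reference, dot_product_attention_impl (the kernel further down) evaluates attn = softmax(Q·Kᵀ/√E + mask) and output = attn·V using two cuBLAS strided-batched GEMMs around a cuDNN softmax; attachToContext below caches the cuBLAS/cuDNN handles it needs. A minimal CPU sketch of the same computation, assuming contiguous row-major tensors and a 2-D mask broadcast over the batch (illustrative names, not part of this patch):

#include <algorithm>
#include <cmath>

// CPU reference for scaled dot-product attention (sketch only; the CUDA
// path uses cuBLAS strided-batched GEMMs plus cudnnSoftmaxForward instead).
// query: [B, Nt, E], key/value: [B, Ns, E], mask: [Nt, Ns] or nullptr.
void scaled_dot_product_attention_ref(const float* query,
                                      const float* key,
                                      const float* value,
                                      const float* mask,
                                      float*       attn,    // [B, Nt, Ns]
                                      float*       output,  // [B, Nt, E]
                                      int B, int Nt, int Ns, int E)
{
    const float scale = 1.0f / std::sqrt(static_cast<float>(E));
    for (int b = 0; b < B; ++b)
    {
        const float* q = query + b * Nt * E;
        const float* k = key + b * Ns * E;
        const float* v = value + b * Ns * E;
        float*       a = attn + b * Nt * Ns;
        float*       o = output + b * Nt * E;
        for (int t = 0; t < Nt; ++t)
        {
            // attn[t][s] = scale * (q[t] . k[s]) + mask[t][s]
            for (int s = 0; s < Ns; ++s)
            {
                float dot = 0.0f;
                for (int e = 0; e < E; ++e) dot += q[t * E + e] * k[s * E + e];
                a[t * Ns + s] = scale * dot + (mask ? mask[t * Ns + s] : 0.0f);
            }
            // numerically stable softmax over the Ns axis
            const float m   = *std::max_element(a + t * Ns, a + (t + 1) * Ns);
            float       sum = 0.0f;
            for (int s = 0; s < Ns; ++s)
            {
                a[t * Ns + s] = std::exp(a[t * Ns + s] - m);
                sum += a[t * Ns + s];
            }
            for (int s = 0; s < Ns; ++s) a[t * Ns + s] /= sum;
            // output[t] = attn[t] @ V
            for (int e = 0; e < E; ++e)
            {
                float acc = 0.0f;
                for (int s = 0; s < Ns; ++s) acc += a[t * Ns + s] * v[s * E + e];
                o[t * E + e] = acc;
            }
        }
    }
}
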
+ void ScaledDotProductAttentionTRT::attachToContext(cudnnContext* cudnnContext, + cublasContext* cublasContext, + IGpuAllocator* gpuAllocator) TRT_NOEXCEPT + { + _cublas_handle = cublasContext; + _cudnn_handle = cudnnContext; + cudnnCreateTensorDescriptor(&_x_desc); + cudnnCreateTensorDescriptor(&_y_desc); + cudnnCreateTensorDescriptor(&_mask_desc); + } + + // Detach the plugin object from its execution context. + void ScaledDotProductAttentionTRT::detachFromContext() TRT_NOEXCEPT + { + cudnnDestroyTensorDescriptor(_y_desc); + cudnnDestroyTensorDescriptor(_x_desc); + cudnnDestroyTensorDescriptor(_mask_desc); + } + + void ScaledDotProductAttentionTRT::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT + { + if (nbInputs != 4) + { + mask_dim = 0; + } + else + { + mask_dim = in[3].desc.dims.nbDims; + } + } + + int ScaledDotProductAttentionTRT::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + if (CUDNN_STATUS_SUCCESS != cudnnSetStream(_cudnn_handle, stream)) return 1; + if (CUBLAS_STATUS_SUCCESS != cublasSetStream(_cublas_handle, stream)) return 1; + int B = inputDesc[0].dims.d[0]; // batch * heads + int Nt = inputDesc[0].dims.d[1]; + int Ns = inputDesc[1].dims.d[1]; + int E = inputDesc[0].dims.d[2]; // embeding size + + const void* query = inputs[0]; + const void* key = inputs[1]; + const void* value = inputs[2]; + const void* mask = nullptr; + + int mask_dims[3]; + mask_dims[0] = 0; + if (mask_dim > 0) + { + mask = inputs[3]; + // check if mask need broadcast + if (mask_dim == 2) + { + mask_dims[0] = 1; + mask_dims[1] = inputDesc[3].dims.d[0]; + mask_dims[2] = inputDesc[3].dims.d[1]; + } + else + { + mask_dims[0] = inputDesc[3].dims.d[0]; + mask_dims[1] = inputDesc[3].dims.d[1]; + mask_dims[2] = inputDesc[3].dims.d[2]; + } + } + + void* output = outputs[0]; + void* attn = outputs[1]; + + auto data_type = inputDesc[0].type; + cudnnDataType_t cudnn_dtype{}; + convert_trt2cudnn_dtype(data_type, &cudnn_dtype); + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + dot_product_attention_impl((float*)query, + (float*)key, + (float*)value, + (float*)mask, + (float*)attn, + (float*)output, + B, + Nt, + Ns, + E, + &mask_dims[0], + _x_desc, + _y_desc, + _mask_desc, + cudnn_dtype, + stream, + _cublas_handle, + _cudnn_handle); + break; + default: + return 1; + } + + return 0; + } + + nvinfer1::DataType ScaledDotProductAttentionTRT::getOutputDataType( + int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* ScaledDotProductAttentionTRT::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* ScaledDotProductAttentionTRT::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int ScaledDotProductAttentionTRT::getNbOutputs() const TRT_NOEXCEPT + { + return 2; + } + + size_t ScaledDotProductAttentionTRT::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void ScaledDotProductAttentionTRT::serialize(void* buffer) const TRT_NOEXCEPT {} + + ////////////////////// creator ///////////////////////////// + + ScaledDotProductAttentionTRTCreator::ScaledDotProductAttentionTRTCreator() {} + + const char* ScaledDotProductAttentionTRTCreator::getPluginName() const TRT_NOEXCEPT + { + 
return PLUGIN_NAME; + } + + const char* ScaledDotProductAttentionTRTCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* ScaledDotProductAttentionTRTCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + ScaledDotProductAttentionTRT* plugin = new ScaledDotProductAttentionTRT(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* ScaledDotProductAttentionTRTCreator::deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new ScaledDotProductAttentionTRT(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + REGISTER_TENSORRT_PLUGIN(ScaledDotProductAttentionTRTCreator); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp index 86d35616a9..4aea4c1e20 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention.hpp @@ -9,65 +9,86 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class ScaledDotProductAttentionTRT : public TRTPluginBase { - public: - ScaledDotProductAttentionTRT(const std::string &name); - - ScaledDotProductAttentionTRT(const std::string name, const void *data, size_t length); - - ScaledDotProductAttentionTRT() = delete; - - ~ScaledDotProductAttentionTRT() TRT_NOEXCEPT override; - - virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; - void attachToContext(cudnnContext *cudnn, cublasContext *cublas, - nvinfer1::IGpuAllocator *allocator) TRT_NOEXCEPT override; - void detachFromContext() TRT_NOEXCEPT override; - - private: - int mask_dim; - cublasHandle_t _cublas_handle{}; - cudnnHandle_t _cudnn_handle{}; - cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _mask_desc{}; -}; - -class ScaledDotProductAttentionTRTCreator : public TRTPluginCreatorBase { - public: - ScaledDotProductAttentionTRTCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char 
*getPluginVersion() const TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class ScaledDotProductAttentionTRT : public TRTPluginBase + { + public: + ScaledDotProductAttentionTRT(const std::string& name); + + ScaledDotProductAttentionTRT(const std::string name, + const void* data, + size_t length); + + ScaledDotProductAttentionTRT() = delete; + + ~ScaledDotProductAttentionTRT() TRT_NOEXCEPT override; + + virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + void attachToContext(cudnnContext* cudnn, + cublasContext* cublas, + nvinfer1::IGpuAllocator* allocator) TRT_NOEXCEPT override; + + void detachFromContext() TRT_NOEXCEPT override; + + private: + int mask_dim; + cublasHandle_t _cublas_handle{}; + cudnnHandle_t _cudnn_handle{}; + cudnnTensorDescriptor_t _x_desc{}, _y_desc{}, _mask_desc{}; + }; + + class ScaledDotProductAttentionTRTCreator : public TRTPluginCreatorBase + { + public: + ScaledDotProductAttentionTRTCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_SCALED_DOT_PRODUCT_ATTENTION_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu index a0ee16c998..9775265b78 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.cu @@ -11,93 +11,228 @@ #include "scaled_dot_product_attention_kernel.hpp" #include "trt_plugin_helper.hpp" -template -cublasStatus_t 
cublasgemmStridedBatchedWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const scalar_t* alpha, const scalar_t* A, int lda, - long long int strideA, const scalar_t* B, int ldb, - long long int strideB, const scalar_t* beta, - scalar_t* C, int ldc, long long int strideC, - int batchCount); +template +cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const scalar_t* alpha, + const scalar_t* A, + int lda, + long long int strideA, + const scalar_t* B, + int ldb, + long long int strideB, + const scalar_t* beta, + scalar_t* C, + int ldc, + long long int strideC, + int batchCount); -template <> -cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float* alpha, const float* A, int lda, - long long int strideA, const float* B, int ldb, - long long int strideB, const float* beta, - float* C, int ldc, long long int strideC, - int batchCount) { - return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, - strideB, beta, C, ldc, strideC, batchCount); +template<> +cublasStatus_t cublasgemmStridedBatchedWrap(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, + const float* A, + int lda, + long long int strideA, + const float* B, + int ldb, + long long int strideB, + const float* beta, + float* C, + int ldc, + long long int strideC, + int batchCount) +{ + return cublasSgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + lda, + strideA, + B, + ldb, + strideB, + beta, + C, + ldc, + strideC, + batchCount); } -template <> -cublasStatus_t cublasgemmStridedBatchedWrap<__half>(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const __half* alpha, const __half* A, int lda, - long long int strideA, const __half* B, int ldb, - long long int strideB, const __half* beta, - __half* C, int ldc, long long int strideC, - int batchCount) { - return cublasHgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, - strideB, beta, C, ldc, strideC, batchCount); +template<> +cublasStatus_t cublasgemmStridedBatchedWrap<__half>(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, + const __half* A, + int lda, + long long int strideA, + const __half* B, + int ldb, + long long int strideB, + const __half* beta, + __half* C, + int ldc, + long long int strideC, + int batchCount) +{ + return cublasHgemmStridedBatched(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + lda, + strideA, + B, + ldb, + strideB, + beta, + C, + ldc, + strideC, + batchCount); } -template -void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, - const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, - int Nt, int Ns, int E, const int* mask_dims, - cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, - cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, - cudaStream_t stream, cublasHandle_t cublas_handle, - cudnnHandle_t cudnn_handle) { - { - // Q @ K - const int m = Ns; - const int n = Nt; - const int k = E; - const auto alpha = scalar_t(1.0f / sqrt(float(E))); - const auto beta = scalar_t(0); - 
cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, key, k, - Ns * E, query, k, Nt * E, &beta, attn, m, Nt * Ns, B); - } +template +void dot_product_attention_impl(const scalar_t* query, + const scalar_t* key, + const scalar_t* value, + const scalar_t* mask, + scalar_t* attn, + scalar_t* output, + int B, + int Nt, + int Ns, + int E, + const int* mask_dims, + cudnnTensorDescriptor_t& x_desc, + cudnnTensorDescriptor_t& y_desc, + cudnnTensorDescriptor_t& mask_desc, + cudnnDataType_t cudnn_dtype, + cudaStream_t stream, + cublasHandle_t cublas_handle, + cudnnHandle_t cudnn_handle) +{ + { + // Q @ K + const int m = Ns; + const int n = Nt; + const int k = E; + const auto alpha = scalar_t(1.0f / sqrt(float(E))); + const auto beta = scalar_t(0); + cublasgemmStridedBatchedWrap(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + m, + n, + k, + &alpha, + key, + k, + Ns * E, + query, + k, + Nt * E, + &beta, + attn, + m, + Nt * Ns, + B); + } - if (mask_dims != nullptr && mask_dims[0] != 0) { - const auto alpha = scalar_t(1); - const auto beta = scalar_t(1); - cudnnSetTensor4dDescriptor(mask_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, mask_dims[0], - mask_dims[1], mask_dims[2]); - cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, B, Nt, Ns); - cudnnAddTensor(cudnn_handle, &alpha, mask_desc, mask, &beta, x_desc, attn); - } + if (mask_dims != nullptr && mask_dims[0] != 0) + { + const auto alpha = scalar_t(1); + const auto beta = scalar_t(1); + cudnnSetTensor4dDescriptor(mask_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + 1, + mask_dims[0], + mask_dims[1], + mask_dims[2]); + cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, 1, B, Nt, Ns); + cudnnAddTensor(cudnn_handle, &alpha, mask_desc, mask, &beta, x_desc, attn); + } - { - // softmax attention - const auto alpha = scalar_t(1); - const auto beta = scalar_t(0); - cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); - cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); - cudnnSoftmaxForward(cudnn_handle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_INSTANCE, &alpha, - x_desc, attn, &beta, y_desc, attn); - } + { + // softmax attention + const auto alpha = scalar_t(1); + const auto beta = scalar_t(0); + cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); + cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dtype, B * Nt, Ns, 1, 1); + cudnnSoftmaxForward(cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + &alpha, + x_desc, + attn, + &beta, + y_desc, + attn); + } - { - // attn @ v - const int m = E; - const int n = Nt; - const int k = Ns; - const auto alpha = scalar_t(1); - const auto beta = scalar_t(0); - cublasgemmStridedBatchedWrap(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, value, m, - Ns * E, (const scalar_t*)(attn), k, Ns * Nt, &beta, output, m, - Nt * E, B); - } + { + // attn @ v + const int m = E; + const int n = Nt; + const int k = Ns; + const auto alpha = scalar_t(1); + const auto beta = scalar_t(0); + cublasgemmStridedBatchedWrap(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + m, + n, + k, + &alpha, + value, + m, + Ns * E, + (const scalar_t*)(attn), + k, + Ns * Nt, + &beta, + output, + m, + Nt * E, + B); + } } -template void dot_product_attention_impl( - const float* query, const float* key, const float* value, const float* mask, float* attn, - float* output, int B, int Nt, int Ns, int E, const int* mask_dims, - cudnnTensorDescriptor_t& x_desc, 
cudnnTensorDescriptor_t& y_desc, - cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, cudaStream_t stream, - cublasHandle_t cublas_handle, cudnnHandle_t cudnn_handle); +template void dot_product_attention_impl(const float* query, + const float* key, + const float* value, + const float* mask, + float* attn, + float* output, + int B, + int Nt, + int Ns, + int E, + const int* mask_dims, + cudnnTensorDescriptor_t& x_desc, + cudnnTensorDescriptor_t& y_desc, + cudnnTensorDescriptor_t& mask_desc, + cudnnDataType_t cudnn_dtype, + cudaStream_t stream, + cublasHandle_t cublas_handle, + cudnnHandle_t cudnn_handle); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp index d1cdc7773a..b11a341aa9 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scaled_dot_product_attention/scaled_dot_product_attention_kernel.hpp @@ -5,13 +5,24 @@ #include #include -template -void dot_product_attention_impl(const scalar_t* query, const scalar_t* key, const scalar_t* value, - const scalar_t* mask, scalar_t* attn, scalar_t* output, int B, - int Nt, int Ns, int E, const int* mask_dims, - cudnnTensorDescriptor_t& x_desc, cudnnTensorDescriptor_t& y_desc, - cudnnTensorDescriptor_t& mask_desc, cudnnDataType_t cudnn_dtype, - cudaStream_t stream, cublasHandle_t cublas_handle, - cudnnHandle_t cudnn_handle); +template +void dot_product_attention_impl(const scalar_t* query, + const scalar_t* key, + const scalar_t* value, + const scalar_t* mask, + scalar_t* attn, + scalar_t* output, + int B, + int Nt, + int Ns, + int E, + const int* mask_dims, + cudnnTensorDescriptor_t& x_desc, + cudnnTensorDescriptor_t& y_desc, + cudnnTensorDescriptor_t& mask_desc, + cudnnDataType_t cudnn_dtype, + cudaStream_t stream, + cublasHandle_t cublas_handle, + cudnnHandle_t cudnn_handle); #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp index 13c637f408..ca5fe92dcc 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.cpp @@ -2,155 +2,218 @@ #include "NvInferVersion.h" // ScatterND is supported since TensorRT8 #if NV_TENSORRT_MAJOR <= 7 -#include -#include - -#include - -#include "trt_scatternd.hpp" -#include "trt_scatternd_kernel.hpp" -#include "trt_serialize.hpp" - -namespace mmdeploy { -namespace { -static const char *PLUGIN_VERSION{"1"}; -static const char *PLUGIN_NAME{"ScatterND"}; -} // namespace - -TRTScatterND::TRTScatterND(const std::string &name) : TRTPluginBase(name) {} - -TRTScatterND::TRTScatterND(const std::string name, const void *data, size_t length) - : TRTPluginBase(name) {} - -nvinfer1::IPluginV2DynamicExt *TRTScatterND::clone() const TRT_NOEXCEPT { - TRTScatterND *plugin = new TRTScatterND(mLayerName); - plugin->setPluginNamespace(getPluginNamespace()); - - return plugin; -} - -nvinfer1::DimsExprs TRTScatterND::getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs *inputs, int nbInputs, - nvinfer1::IExprBuilder &exprBuilder) TRT_NOEXCEPT { - return inputs[0]; -} - -bool TRTScatterND::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, - int nbInputs, int nbOutputs) TRT_NOEXCEPT { - if (pos < nbInputs) { - switch (pos) { - case 0: 
- // data - return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || - (ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); - case 1: - // indices - return ioDesc[pos].type == nvinfer1::DataType::kINT32 && - ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; - case 2: - // updates - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - default: - return true; + #include + #include + + #include + + #include "trt_scatternd.hpp" + #include "trt_scatternd_kernel.hpp" + #include "trt_serialize.hpp" + +namespace mmdeploy +{ + namespace + { + static const char* PLUGIN_VERSION{"1"}; + static const char* PLUGIN_NAME{"ScatterND"}; + } // namespace + + TRTScatterND::TRTScatterND(const std::string& name) + : TRTPluginBase(name) + { + } + + TRTScatterND::TRTScatterND(const std::string name, const void* data, size_t length) + : TRTPluginBase(name) + { } - } else { - switch (pos - nbInputs) { - case 0: - // output - return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; - default: + + nvinfer1::IPluginV2DynamicExt* TRTScatterND::clone() const TRT_NOEXCEPT + { + TRTScatterND* plugin = new TRTScatterND(mLayerName); + plugin->setPluginNamespace(getPluginNamespace()); + + return plugin; + } + + nvinfer1::DimsExprs TRTScatterND::getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT + { + return inputs[0]; + } + + bool TRTScatterND::supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT + { + if (pos < nbInputs) + { + switch (pos) + { + case 0: + // data + return (ioDesc[pos].type == nvinfer1::DataType::kFLOAT && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) || + (ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR); + case 1: + // indices + return ioDesc[pos].type == nvinfer1::DataType::kINT32 && + ioDesc[pos].format == nvinfer1::TensorFormat::kLINEAR; + case 2: + // updates + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + } + else + { + switch (pos - nbInputs) + { + case 0: + // output + return ioDesc[pos].type == ioDesc[0].type && ioDesc[pos].format == ioDesc[0].format; + default: + return true; + } + } return true; } - } - return true; -} - -void TRTScatterND::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *inputs, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *outputs, - int nbOutputs) TRT_NOEXCEPT {} - -size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT { - return 0; -} - -int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workSpace, cudaStream_t stream) TRT_NOEXCEPT { - const int *dims = &(inputDesc[0].dims.d[0]); - const int *indices_dims = &(inputDesc[1].dims.d[0]); - int nbDims = inputDesc[0].dims.nbDims; - int indice_nbDims = inputDesc[1].dims.nbDims; - - const void *data = inputs[0]; - const void *indices = inputs[1]; - const void *update = inputs[2]; - void *output = outputs[0]; - - auto data_type = inputDesc[0].type; - - switch (data_type) { - case 
nvinfer1::DataType::kFLOAT: - TRTONNXScatterNDKernelLauncher((float *)data, (int *)indices, (float *)update, dims, - nbDims, indices_dims, indice_nbDims, (float *)output, - stream); - break; - - case nvinfer1::DataType::kINT32: - TRTONNXScatterNDKernelLauncher((int *)data, (int *)indices, (int *)update, dims, nbDims, - indices_dims, indice_nbDims, (int *)output, stream); - break; - default: - break; - } - - return 0; -} - -nvinfer1::DataType TRTScatterND::getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT { - return inputTypes[0]; -} - -// IPluginV2 Methods -const char *TRTScatterND::getPluginType() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTScatterND::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -int TRTScatterND::getNbOutputs() const TRT_NOEXCEPT { return 1; } - -size_t TRTScatterND::getSerializationSize() const TRT_NOEXCEPT { return 0; } - -void TRTScatterND::serialize(void *buffer) const TRT_NOEXCEPT {} - -TRTScatterNDCreator::TRTScatterNDCreator() { - mPluginAttributes.clear(); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -const char *TRTScatterNDCreator::getPluginName() const TRT_NOEXCEPT { return PLUGIN_NAME; } - -const char *TRTScatterNDCreator::getPluginVersion() const TRT_NOEXCEPT { return PLUGIN_VERSION; } - -nvinfer1::IPluginV2 *TRTScatterNDCreator::createPlugin( - const char *name, const nvinfer1::PluginFieldCollection *fc) TRT_NOEXCEPT { - TRTScatterND *plugin = new TRTScatterND(name); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -nvinfer1::IPluginV2 *TRTScatterNDCreator::deserializePlugin(const char *name, - const void *serialData, - size_t serialLength) TRT_NOEXCEPT { - auto plugin = new TRTScatterND(name, serialData, serialLength); - plugin->setPluginNamespace(getPluginNamespace()); - return plugin; -} - -REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator); + + void TRTScatterND::configurePlugin(const nvinfer1::DynamicPluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, + int nbOutputs) TRT_NOEXCEPT {} + + size_t TRTScatterND::getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT + { + return 0; + } + + int TRTScatterND::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workSpace, + cudaStream_t stream) TRT_NOEXCEPT + { + const int* dims = &(inputDesc[0].dims.d[0]); + const int* indices_dims = &(inputDesc[1].dims.d[0]); + int nbDims = inputDesc[0].dims.nbDims; + int indice_nbDims = inputDesc[1].dims.nbDims; + + const void* data = inputs[0]; + const void* indices = inputs[1]; + const void* update = inputs[2]; + void* output = outputs[0]; + + auto data_type = inputDesc[0].type; + + switch (data_type) + { + case nvinfer1::DataType::kFLOAT: + TRTONNXScatterNDKernelLauncher((float*)data, + (int*)indices, + (float*)update, + dims, + nbDims, + indices_dims, + indice_nbDims, + (float*)output, + stream); + break; + + case nvinfer1::DataType::kINT32: + TRTONNXScatterNDKernelLauncher((int*)data, + (int*)indices, + (int*)update, + dims, + nbDims, + indices_dims, + indice_nbDims, + (int*)output, + stream); + break; + default: + break; + } + + return 0; + } + + nvinfer1::DataType TRTScatterND::getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) 
const TRT_NOEXCEPT + { + return inputTypes[0]; + } + + // IPluginV2 Methods + const char* TRTScatterND::getPluginType() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTScatterND::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + int TRTScatterND::getNbOutputs() const TRT_NOEXCEPT + { + return 1; + } + + size_t TRTScatterND::getSerializationSize() const TRT_NOEXCEPT + { + return 0; + } + + void TRTScatterND::serialize(void* buffer) const TRT_NOEXCEPT {} + + TRTScatterNDCreator::TRTScatterNDCreator() + { + mPluginAttributes.clear(); + mFC.nbFields = mPluginAttributes.size(); + mFC.fields = mPluginAttributes.data(); + } + + const char* TRTScatterNDCreator::getPluginName() const TRT_NOEXCEPT + { + return PLUGIN_NAME; + } + + const char* TRTScatterNDCreator::getPluginVersion() const TRT_NOEXCEPT + { + return PLUGIN_VERSION; + } + + nvinfer1::IPluginV2* TRTScatterNDCreator::createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT + { + TRTScatterND* plugin = new TRTScatterND(name); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + nvinfer1::IPluginV2* TRTScatterNDCreator::deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT + { + auto plugin = new TRTScatterND(name, serialData, serialLength); + plugin->setPluginNamespace(getPluginNamespace()); + return plugin; + } + + REGISTER_TENSORRT_PLUGIN(TRTScatterNDCreator); } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp index d6b859855e..6afbbe450e 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd.hpp @@ -9,56 +9,77 @@ #include "trt_plugin_base.hpp" -namespace mmdeploy { -class TRTScatterND : public TRTPluginBase { - public: - TRTScatterND(const std::string &name); - - TRTScatterND(const std::string name, const void *data, size_t length); - - TRTScatterND() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt *clone() const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int outputIndex, const nvinfer1::DimsExprs *inputs, - int nbInputs, nvinfer1::IExprBuilder &exprBuilder) - TRT_NOEXCEPT override; - bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *ioDesc, int nbInputs, - int nbOutputs) TRT_NOEXCEPT override; - void configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc *out, - int nbOutputs) TRT_NOEXCEPT override; - size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc *inputs, int nbInputs, - const nvinfer1::PluginTensorDesc *outputs, - int nbOutputs) const TRT_NOEXCEPT override; - int enqueue(const nvinfer1::PluginTensorDesc *inputDesc, - const nvinfer1::PluginTensorDesc *outputDesc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int index, const nvinfer1::DataType *inputTypes, - int nbInputs) const TRT_NOEXCEPT override; - - // IPluginV2 Methods - const char *getPluginType() const TRT_NOEXCEPT override; - const char *getPluginVersion() const TRT_NOEXCEPT override; - int getNbOutputs() const TRT_NOEXCEPT override; - size_t getSerializationSize() const TRT_NOEXCEPT override; - void serialize(void *buffer) const TRT_NOEXCEPT override; 
-}; - -class TRTScatterNDCreator : public TRTPluginCreatorBase { - public: - TRTScatterNDCreator(); - - const char *getPluginName() const TRT_NOEXCEPT override; - - const char *getPluginVersion() const TRT_NOEXCEPT override; - nvinfer1::IPluginV2 *createPlugin(const char *name, const nvinfer1::PluginFieldCollection *fc) - TRT_NOEXCEPT override; - - nvinfer1::IPluginV2 *deserializePlugin(const char *name, const void *serialData, - size_t serialLength) TRT_NOEXCEPT override; -}; +namespace mmdeploy +{ + class TRTScatterND : public TRTPluginBase + { + public: + TRTScatterND(const std::string& name); + + TRTScatterND(const std::string name, + const void* data, + size_t length); + + TRTScatterND() = delete; + + // IPluginV2DynamicExt Methods + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions(int outputIndex, + const nvinfer1::DimsExprs* inputs, + int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* ioDesc, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + // IPluginV2 Methods + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + }; + + class TRTScatterNDCreator : public TRTPluginCreatorBase + { + public: + TRTScatterNDCreator(); + + const char* getPluginName() const TRT_NOEXCEPT override; + + const char* getPluginVersion() const TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) TRT_NOEXCEPT override; + }; } // namespace mmdeploy #endif // TRT_SCATTERND_HPP diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu index c763992e9f..cd5a235afa 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.cu @@ -8,68 +8,98 @@ using mmdeploy::TensorDesc; -template -__global__ void onnx_scatternd_kernel(const int n, const int* indices, const T* update, T* output, - TensorDesc tensor_desc, TensorDesc indice_desc) { - const int indice_cols = indice_desc.shape[indice_desc.dim - 1]; - const int copy_stride = tensor_desc.stride[indice_cols - 1]; - const int* stride = &(tensor_desc.stride[0]); - CUDA_1D_KERNEL_LOOP(index, n) { - int output_offset = 0; - const int* indices_current = 
indices + index * indice_cols; - for (int i = 0; i < indice_cols; ++i) { - output_offset += stride[i] * indices_current[i]; +template +__global__ void onnx_scatternd_kernel(const int n, + const int* indices, + const T* update, + T* output, + TensorDesc tensor_desc, + TensorDesc indice_desc) +{ + const int indice_cols = indice_desc.shape[indice_desc.dim - 1]; + const int copy_stride = tensor_desc.stride[indice_cols - 1]; + const int* stride = &(tensor_desc.stride[0]); + CUDA_1D_KERNEL_LOOP(index, n) + { + int output_offset = 0; + const int* indices_current = indices + index * indice_cols; + for (int i = 0; i < indice_cols; ++i) + { + output_offset += stride[i] * indices_current[i]; + } + memcpy(output + output_offset, update + index * copy_stride, copy_stride * sizeof(T)); } - memcpy(output + output_offset, update + index * copy_stride, copy_stride * sizeof(T)); - } } -template -void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, - const int* dims, int nbDims, const int* indices_dims, - int indice_nbDims, T* output, cudaStream_t stream) { - // fill tensordesc and initial - TensorDesc tensor_desc; - memset((void*)&tensor_desc, 0, sizeof(TensorDesc)); - tensor_desc.dim = nbDims; - tensor_desc.shape[nbDims - 1] = dims[nbDims - 1]; - tensor_desc.stride[nbDims - 1] = 1; - for (int i = nbDims - 2; i >= 0; --i) { - tensor_desc.shape[i] = dims[i]; - tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1]; - } - const int data_size = tensor_desc.stride[0] * tensor_desc.shape[0]; +template +void TRTONNXScatterNDKernelLauncher(const T* data, + const int* indices, + const T* update, + const int* dims, + int nbDims, + const int* indices_dims, + int indice_nbDims, + T* output, + cudaStream_t stream) +{ + // fill tensordesc and initial + TensorDesc tensor_desc; + memset((void*)&tensor_desc, 0, sizeof(TensorDesc)); + tensor_desc.dim = nbDims; + tensor_desc.shape[nbDims - 1] = dims[nbDims - 1]; + tensor_desc.stride[nbDims - 1] = 1; + for (int i = nbDims - 2; i >= 0; --i) + { + tensor_desc.shape[i] = dims[i]; + tensor_desc.stride[i] = dims[i + 1] * tensor_desc.stride[i + 1]; + } + const int data_size = tensor_desc.stride[0] * tensor_desc.shape[0]; - TensorDesc indice_desc; - memset((void*)&indice_desc, 0, sizeof(TensorDesc)); - indice_desc.dim = indice_nbDims; - indice_desc.shape[indice_nbDims - 1] = indices_dims[indice_nbDims - 1]; - indice_desc.stride[indice_nbDims - 1] = 1; - for (int i = indice_nbDims - 2; i >= 0; --i) { - indice_desc.shape[i] = indices_dims[i]; - indice_desc.stride[i] = indices_dims[i + 1] * indice_desc.stride[i + 1]; - } + TensorDesc indice_desc; + memset((void*)&indice_desc, 0, sizeof(TensorDesc)); + indice_desc.dim = indice_nbDims; + indice_desc.shape[indice_nbDims - 1] = indices_dims[indice_nbDims - 1]; + indice_desc.stride[indice_nbDims - 1] = 1; + for (int i = indice_nbDims - 2; i >= 0; --i) + { + indice_desc.shape[i] = indices_dims[i]; + indice_desc.stride[i] = indices_dims[i + 1] * indice_desc.stride[i + 1]; + } - // output = np.copy(data) - cudaMemcpyAsync(output, data, data_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); + // output = np.copy(data) + cudaMemcpyAsync(output, data, data_size * sizeof(T), cudaMemcpyDeviceToDevice, stream); - int num_update_indice = 1; - for (int i = 0; i < indice_nbDims - 1; ++i) { - num_update_indice *= indice_desc.shape[i]; - } - // scatter - const int col_block = DIVUP(num_update_indice, THREADS_PER_BLOCK); - onnx_scatternd_kernel<<>>( - num_update_indice, indices, update, output, tensor_desc, 
indice_desc); + int num_update_indice = 1; + for (int i = 0; i < indice_nbDims - 1; ++i) + { + num_update_indice *= indice_desc.shape[i]; + } + // scatter + const int col_block = DIVUP(num_update_indice, THREADS_PER_BLOCK); + onnx_scatternd_kernel<<>>(num_update_indice, + indices, + update, + output, + tensor_desc, + indice_desc); } -template void TRTONNXScatterNDKernelLauncher(const float* data, const int* indices, - const float* update, const int* dims, - int nbDims, const int* indices_dims, - int indice_nbDims, float* output, +template void TRTONNXScatterNDKernelLauncher(const float* data, + const int* indices, + const float* update, + const int* dims, + int nbDims, + const int* indices_dims, + int indice_nbDims, + float* output, cudaStream_t stream); -template void TRTONNXScatterNDKernelLauncher(const int* data, const int* indices, - const int* update, const int* dims, int nbDims, - const int* indices_dims, int indice_nbDims, - int* output, cudaStream_t stream); +template void TRTONNXScatterNDKernelLauncher(const int* data, + const int* indices, + const int* update, + const int* dims, + int nbDims, + const int* indices_dims, + int indice_nbDims, + int* output, + cudaStream_t stream); diff --git a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp index b64b66494d..093ccda4f0 100644 --- a/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp +++ b/csrc/mmdeploy/backend_ops/tensorrt/scatternd/trt_scatternd_kernel.hpp @@ -3,9 +3,15 @@ #define TRT_SCATTERND_KERNEL_HPP #include -template -void TRTONNXScatterNDKernelLauncher(const T* data, const int* indices, const T* update, - const int* dims, int nbDims, const int* indices_dims, - int indice_nbDims, T* output, cudaStream_t stream); +template +void TRTONNXScatterNDKernelLauncher(const T* data, + const int* indices, + const T* update, + const int* dims, + int nbDims, + const int* indices_dims, + int indice_nbDims, + T* output, + cudaStream_t stream); #endif // TRT_SCATTERND_KERNEL_HPP diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/CMakeLists.txt b/csrc/mmdeploy/backend_ops/torchscript/ops/CMakeLists.txt index 4a6120d0f8..91e0254570 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/CMakeLists.txt @@ -1,41 +1,48 @@ # Copyright (c) OpenMMLab. All rights reserved. 
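The launcher above follows the ONNX ScatterND contract: copy data into output, then, for each row of indices, overwrite one copy_stride-sized slice with the matching slice of updates. A CPU sketch of the same rule, assuming precomputed row-major strides (illustrative names, not part of this patch):

#include <cstring>

// ONNX ScatterND reference (sketch). indices: [num_updates, indice_cols];
// each row selects a slice of `output` of length stride[indice_cols - 1].
void scatternd_ref(const float* data,
                   const int*   indices,
                   const float* updates,
                   const int*   stride,     // row-major strides of `data`
                   int          data_size,  // total element count of `data`
                   int          num_updates,
                   int          indice_cols,
                   float*       output)
{
    // output = np.copy(data)
    std::memcpy(output, data, data_size * sizeof(float));
    const int copy_stride = stride[indice_cols - 1];
    for (int i = 0; i < num_updates; ++i)
    {
        int offset = 0;
        for (int c = 0; c < indice_cols; ++c)
            offset += stride[c] * indices[i * indice_cols + c];
        std::memcpy(output + offset, updates + i * copy_stride, copy_stride * sizeof(float));
    }
}
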
if("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES) - project(mmdeploy_torchscript_ops CUDA CXX) - file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu) + project(mmdeploy_torchscript_ops CUDA CXX) + file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp *.cu) else() - project(mmdeploy_torchscript_ops CXX) - file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp) + project(mmdeploy_torchscript_ops CXX) + file(GLOB_RECURSE BACKEND_OPS_SRCS *.cpp) endif() find_package(Torch REQUIRED) if(MSVC) - # workaround to fix building torchscript ops on windows - set(_TORCH_TARGET torch_cuda_cu torch_cuda_cpp torch_cpu) - foreach(_target IN LISTS _TORCH_TARGET) - if(TARGET ${_target}) - get_property(FIXED_TORCH_CPU_COMPILE_OPTIONS TARGET ${_target} PROPERTY INTERFACE_COMPILE_OPTIONS) - string(REPLACE ";" " " FIXED_TORCH_CPU_COMPILE_OPTIONS "${FIXED_TORCH_CPU_COMPILE_OPTIONS}") - set_property(TARGET ${_target} PROPERTY INTERFACE_COMPILE_OPTIONS -Xcompiler "${FIXED_TORCH_CPU_COMPILE_OPTIONS}") - else() - message(WARNING "Target ${_target} not found.") - endif() - endforeach() + # workaround to fix building torchscript ops on windows + set(_TORCH_TARGET torch_cuda_cu torch_cuda_cpp torch_cpu) + foreach(_target IN LISTS _TORCH_TARGET) + if(TARGET ${_target}) + get_property( + FIXED_TORCH_CPU_COMPILE_OPTIONS + TARGET ${_target} + PROPERTY INTERFACE_COMPILE_OPTIONS) + string(REPLACE ";" " " FIXED_TORCH_CPU_COMPILE_OPTIONS + "${FIXED_TORCH_CPU_COMPILE_OPTIONS}") + set_property( + TARGET ${_target} PROPERTY INTERFACE_COMPILE_OPTIONS -Xcompiler + "${FIXED_TORCH_CPU_COMPILE_OPTIONS}") + else() + message(WARNING "Target ${_target} not found.") + endif() + endforeach() endif() add_library(${PROJECT_NAME}_obj OBJECT "${BACKEND_OPS_SRCS}") -set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1) +set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE + 1) target_compile_definitions(${PROJECT_NAME}_obj - PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1) + PRIVATE -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT=1) target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../common) + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../common) target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common) + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common) if("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES) - target_include_directories(${PROJECT_NAME}_obj - PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include) + target_include_directories(${PROJECT_NAME}_obj + PRIVATE ${CUDA_TOOLKIT_ROOT_DIR}/include) endif() target_link_libraries(${PROJECT_NAME}_obj PRIVATE ${TORCH_LIBRARIES}) mmdeploy_export(${PROJECT_NAME}_obj) diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp index f236ac9b66..777b2b1eed 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/bind.cpp @@ -1,13 +1,14 @@ // Copyright (c) OpenMMLab. All rights reserved. 
#include "torch/script.h" -TORCH_LIBRARY(mmdeploy, m) { - m.def( - "modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor " - "mask, " - "int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int " - "dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor") - .def( - "coreml_nms(Tensor boxes, Tensor scores, float iou_threshold, " - "float score_threshold, int max_boxes) -> Tensor[]"); +TORCH_LIBRARY(mmdeploy, m) +{ + m.def( + "modulated_deform_conv(Tensor input, Tensor weight, Tensor bias, Tensor offset, Tensor " + "mask, " + "int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int " + "dilation_h,int dilation_w, int groups, int deform_groups, bool with_bias) -> Tensor") + .def( + "coreml_nms(Tensor boxes, Tensor scores, float iou_threshold, " + "float score_threshold, int max_boxes) -> Tensor[]"); } diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp index a78b701349..f83a0ec313 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/coreml_nms/coreml_nms_cpu.cpp @@ -4,28 +4,36 @@ #include #include "torch/script.h" -namespace mmdeploy { - -using at::Tensor; - -std::vector<Tensor> coreml_nms_cpu(Tensor boxes, Tensor scores, double iou_threshold, - double score_threshold, int64_t max_boxes) { - assert(boxes.dim() == 3); // bboxes with shape (batch_size, num_bboxes, 4) - assert(boxes.size(2) == 4); - assert(boxes.size(0) == scores.size(0)); // check batch size - assert(boxes.size(1) == scores.size(1)); // check num boxes - - auto batch_size = boxes.size(0); - auto num_boxes = boxes.size(1); - auto num_classes = scores.size(2); - - Tensor ret_boxes = at::zeros({batch_size, max_boxes, 4}); - Tensor ret_scores = at::zeros({batch_size, max_boxes, num_classes}); - Tensor indices = at::zeros({batch_size, max_boxes}, at::kInt); - Tensor num_outputs = at::zeros({batch_size}, at::kInt); - - return std::vector<Tensor>({ret_boxes, ret_scores, indices, num_outputs}); -} - -TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) { m.impl("coreml_nms", coreml_nms_cpu); } +namespace mmdeploy +{ + + using at::Tensor; + + std::vector<Tensor> coreml_nms_cpu(Tensor boxes, + Tensor scores, + double iou_threshold, + double score_threshold, + int64_t max_boxes) + { + assert(boxes.dim() == 3); // bboxes with shape (batch_size, num_bboxes, 4) + assert(boxes.size(2) == 4); + assert(boxes.size(0) == scores.size(0)); // check batch size + assert(boxes.size(1) == scores.size(1)); // check num boxes + + auto batch_size = boxes.size(0); + auto num_boxes = boxes.size(1); + auto num_classes = scores.size(2); + + Tensor ret_boxes = at::zeros({batch_size, max_boxes, 4}); + Tensor ret_scores = at::zeros({batch_size, max_boxes, num_classes}); + Tensor indices = at::zeros({batch_size, max_boxes}, at::kInt); + Tensor num_outputs = at::zeros({batch_size}, at::kInt); + + return std::vector<Tensor>({ret_boxes, ret_scores, indices, num_outputs}); + } + + TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) + { + m.impl("coreml_nms", coreml_nms_cpu); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp index c6d980919f..3a9b32e83b 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp
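The deform-conv forward passes in the files below size their output with the standard dilated-convolution formula; a small hypothetical helper (not from this PR) makes the arithmetic explicit:

#include <cstdint>

// Output extent of one spatial axis: the effective kernel span grows with
// dilation to dilation * (kernel - 1) + 1, then the usual pad/stride rule applies.
inline int64_t conv_out_size(int64_t in, int64_t pad, int64_t dilation, int64_t kernel, int64_t stride)
{
    return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}

// Example: a 7-wide axis, 3-wide kernel, pad 1, stride 2, dilation 1 gives
// conv_out_size(7, 1, 1, 3, 2) == 4, matching height_out/width_out below.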
+++ b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cpu.cpp @@ -3,92 +3,133 @@ #include "torch/script.h" -namespace mmdeploy { - -void modulated_deformable_im2col_cpu( - const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int64_t batch_size, const int64_t channels, const int64_t height_im, - const int64_t width_im, const int64_t height_col, const int64_t width_col, - const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, - const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, int64_t deformable_group, at::Tensor data_col) { - // num_axes should be smaller than block size - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] { - const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); - const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>(); - scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); - - deformable_im2col_2d(data_im_, data_offset_, data_mask_, height_im, width_im, - kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, - dilation_h, dilation_w, channels, deformable_group, - height_col, width_col, data_mask_ != nullptr, data_col_); - })); -} - -at::Tensor modulated_deform_conv_forward_cpu(at::Tensor input, at::Tensor weight, at::Tensor bias, - at::Tensor offset, at::Tensor mask, int64_t kernel_h, - int64_t kernel_w, int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w, int64_t dilation_h, - int64_t dilation_w, int64_t group, - int64_t deformable_group, bool with_bias) { - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - - const int channels_out = weight.size(0); - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); - - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, - kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, - channels_kernel * group); - - const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - // resize output - at::Tensor output = - at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); - // resize temporary columns - at::Tensor columns = at::zeros( - {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, input.options()); - - // divide into group - weight = - weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); - for (int b = 0; b < batch; b++) { - modulated_deformable_im2col_cpu(input[b], offset[b], mask[b], 1, channels, height, width, - height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, dilation_h, dilation_w, deformable_group, - columns); - - for (int g = 0; g < group; g++) { - output[b][g] = - output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); +namespace mmdeploy +{ + + void modulated_deformable_im2col_cpu(const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int64_t 
batch_size, + const int64_t channels, + const int64_t height_im, + const int64_t width_im, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + int64_t deformable_group, + at::Tensor data_col) + { + // num_axes should be smaller than block size + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(data_im.scalar_type(), + "modulated_deformable_im2col_cpu", + ([&] + { + const scalar_t* data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t* data_offset_ = data_offset.data_ptr<scalar_t>(); + const scalar_t* data_mask_ = data_mask.data_ptr<scalar_t>(); + scalar_t* data_col_ = data_col.data_ptr<scalar_t>(); + + deformable_im2col_2d(data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channels, + deformable_group, + height_col, + width_col, + data_mask_ != nullptr, + data_col_); })); } - } - output = output.view( - {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); - - if (with_bias) { - output += bias.view({1, bias.size(0), 1, 1}); - } - - return output; -} + at::Tensor modulated_deform_conv_forward_cpu(at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor offset, + at::Tensor mask, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t group, + int64_t deformable_group, + bool with_bias) + { + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel * group); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + // resize output + at::Tensor output = + at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); + // resize temporary columns + at::Tensor columns = at::zeros( + {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, + input.options()); + + // divide into group + weight = + weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); + for (int b = 0; b < batch; b++) + { + modulated_deformable_im2col_cpu(input[b], offset[b], mask[b], 1, channels, height, width, height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, deformable_group, columns); + + for (int g = 0; g < group; g++) + { + output[b][g] = + output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); + } + } + + output = output.view( + {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); + + if (with_bias) + { + output += bias.view({1, bias.size(0), 1, 1}); + } + + 
return output; + } -TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) { - m.impl("modulated_deform_conv", modulated_deform_conv_forward_cpu); -} + TORCH_LIBRARY_IMPL(mmdeploy, CPU, m) + { + m.impl("modulated_deform_conv", modulated_deform_conv_forward_cpu); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu index 3f9b6aef08..53cb5fd65c 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu +++ b/csrc/mmdeploy/backend_ops/torchscript/ops/modulated_deform_conv/modulated_deform_conv_cuda.cu @@ -3,95 +3,157 @@ #include "modulated_deform_conv/modulated_deform_conv_cuda.cuh" #include "torch/script.h" -namespace mmdeploy { - -void modulated_deformable_im2col_cuda( - const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, - const int64_t batch_size, const int64_t channels, const int64_t height_im, - const int64_t width_im, const int64_t height_col, const int64_t width_col, - const int64_t kernel_h, const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w, - const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h, - const int64_t dilation_w, const int64_t deformable_group, at::Tensor data_col) { - // num_axes should be smaller than block size - const int channel_per_deformable_group = channels / deformable_group; - const int num_kernels = channels * batch_size * height_col * width_col; - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - data_im.scalar_type(), "modulated_deformable_im2col_cuda", ([&] { - const scalar_t *data_im_ = data_im.data_ptr(); - const scalar_t *data_offset_ = data_offset.data_ptr(); - const scalar_t *data_mask_ = data_mask.data_ptr(); - scalar_t *data_col_ = data_col.data_ptr(); - modulated_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, - kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, - channel_per_deformable_group, batch_size, channels, deformable_group, height_col, - width_col, data_col_); - })); -} - -at::Tensor modulated_deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, at::Tensor bias, - at::Tensor offset, at::Tensor mask, int64_t kernel_h, - int64_t kernel_w, int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w, int64_t dilation_h, - int64_t dilation_w, int64_t group, - int64_t deformable_group, bool with_bias) { - at::DeviceGuard guard(input.device()); - - const int batch = input.size(0); - const int channels = input.size(1); - const int height = input.size(2); - const int width = input.size(3); - - const int channels_out = weight.size(0); - const int channels_kernel = weight.size(1); - const int kernel_h_ = weight.size(2); - const int kernel_w_ = weight.size(3); - - if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) - AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, - kernel_h_, kernel_w_); - if (channels != channels_kernel * group) - AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, - channels_kernel * group); - - const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; - const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - - // resize output - at::Tensor output = - at::zeros({batch, group, channels_out / group, height_out, width_out}, 
input.options()); - // resize temporary columns - at::Tensor columns = at::zeros( - {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, input.options()); - - // divide into group - weight = - weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); - for (int b = 0; b < batch; b++) { - modulated_deformable_im2col_cuda(input[b], offset[b], mask[b], 1, channels, height, width, - height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, - stride_h, stride_w, dilation_h, dilation_w, deformable_group, - columns); - - for (int g = 0; g < group; g++) { - output[b][g] = - output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); +namespace mmdeploy +{ + + void modulated_deformable_im2col_cuda(const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int64_t batch_size, + const int64_t channels, + const int64_t height_im, + const int64_t width_im, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + const int64_t deformable_group, + at::Tensor data_col) + { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), + "modulated_deformable_im2col_cuda", + ([&] + { + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* data_col_ = data_col.data_ptr(); + + modulated_deformable_im2col_gpu_kernel + <<>>(num_kernels, + data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + channels, + deformable_group, + height_col, + width_col, + data_col_); })); } - } - output = output.view( - {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); + at::Tensor modulated_deform_conv_forward_cuda(at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor offset, + at::Tensor mask, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t group, + int64_t deformable_group, + bool with_bias) + { + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel * group); + + const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; - if (with_bias) { - 
output += bias.view({1, bias.size(0), 1, 1}); - } + // resize output + at::Tensor output = + at::zeros({batch, group, channels_out / group, height_out, width_out}, input.options()); + // resize temporary columns + at::Tensor columns = at::zeros( + {group, channels * kernel_h * kernel_w / group, 1 * height_out * width_out}, + input.options()); - return output; -} + // divide into group + weight = + weight.view({group, weight.size(0) / group, weight.size(1), weight.size(2), weight.size(3)}); + for (int b = 0; b < batch; b++) + { + modulated_deformable_im2col_cuda(input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns); -TORCH_LIBRARY_IMPL(mmdeploy, CUDA, m) { - m.impl("modulated_deform_conv", modulated_deform_conv_forward_cuda); -} + for (int g = 0; g < group; g++) + { + output[b][g] = + output[b][g].flatten(1).addmm_(weight[g].flatten(1), columns[g]).view_as(output[b][g]); + } + } + + output = output.view( + {output.size(0), output.size(1) * output.size(2), output.size(3), output.size(4)}); + + if (with_bias) + { + output += bias.view({1, bias.size(0), 1, 1}); + } + + return output; + } + + TORCH_LIBRARY_IMPL(mmdeploy, CUDA, m) + { + m.impl("modulated_deform_conv", modulated_deform_conv_forward_cuda); + } } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/CMakeLists.txt b/csrc/mmdeploy/backend_ops/torchscript/optimizer/CMakeLists.txt index 1b5e75ccca..c528972177 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/CMakeLists.txt +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/CMakeLists.txt @@ -3,16 +3,18 @@ project(ts_optimizer) find_package(Torch REQUIRED) -find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib") -if (NOT TARGET pybind11) - add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) -endif () +find_library(TORCH_PYTHON_LIBRARY torch_python + PATHS "${TORCH_INSTALL_PREFIX}/lib") +if(NOT TARGET pybind11) + add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11) +endif() file(GLOB_RECURSE OPTIMIZER_SRCS *.cpp) pybind11_add_module(${PROJECT_NAME} ${OPTIMIZER_SRCS}) -target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) +target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} + ${TORCH_PYTHON_LIBRARY}) target_link_directories(${PROJECT_NAME} PRIVATE mmdeploy::torchscript_ops) set_target_properties( - ${PROJECT_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY - ${CMAKE_SOURCE_DIR}/mmdeploy/backend/torchscript) + ${PROJECT_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY + ${CMAKE_SOURCE_DIR}/mmdeploy/backend/torchscript) diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp index 3b8bb0f632..49d3d8930a 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/bind.cpp @@ -12,36 +12,45 @@ #include "passes/onnx/merge_shape_concate.h" #include "passes/onnx/onnx_peephole.h" -namespace mmdeploy { -namespace torch_jit { +namespace mmdeploy +{ + namespace torch_jit + { -void optimize_for_backend(torch::jit::Module& model, const std::string& ir = "torchscript", - const std::string& backend = "torchscript") { - if (ir == "torchscript") { - model = optimize_for_torchscript(model); - } else if (ir == "onnx") { - model = optimize_for_onnx(model); - } else { - 
fprintf(stderr, "No optimize for combination ir: %s backend: %s\n", ir.c_str(), - backend.c_str()); - exit(-1); - } -} + void optimize_for_backend(torch::jit::Module& model, + const std::string& ir = "torchscript", + const std::string& backend = "torchscript") + { + if (ir == "torchscript") + { + model = optimize_for_torchscript(model); + } + else if (ir == "onnx") + { + model = optimize_for_onnx(model); + } + else + { + fprintf(stderr, "No optimize for combination ir: %s backend: %s\n", ir.c_str(), backend.c_str()); + exit(-1); + } + } -PYBIND11_MODULE(ts_optimizer, m) { - namespace py = pybind11; - m.def("optimize_for_backend", optimize_for_backend, py::arg("module"), - py::arg("ir") = std::string("torchscript"), - py::arg("backend") = std::string("torchscript")); - py::module_ onnx_module = m.def_submodule("onnx"); - onnx_module.def("_jit_pass_merge_shape_concate", MergeShapeConcate, py::arg("graph")); - onnx_module.def("_jit_pass_onnx_peephole", ONNXPeephole, py::arg("graph")); - onnx_module.def("_jit_pass_flatten_cls_head", FlattenClsHead, py::arg("graph")); - onnx_module.def("_jit_pass_fuse_select_assign", FuseSelectAssign, py::arg("graph"), - py::arg("params")); - onnx_module.def("_jit_pass_common_subgraph_elimination", CommonSubgraphElimination, - py::arg("graph"), py::arg("params")); -} + PYBIND11_MODULE(ts_optimizer, m) + { + namespace py = pybind11; + m.def("optimize_for_backend", + optimize_for_backend, + py::arg("module"), + py::arg("ir") = std::string("torchscript"), + py::arg("backend") = std::string("torchscript")); + py::module_ onnx_module = m.def_submodule("onnx"); + onnx_module.def("_jit_pass_merge_shape_concate", MergeShapeConcate, py::arg("graph")); + onnx_module.def("_jit_pass_onnx_peephole", ONNXPeephole, py::arg("graph")); + onnx_module.def("_jit_pass_flatten_cls_head", FlattenClsHead, py::arg("graph")); + onnx_module.def("_jit_pass_fuse_select_assign", FuseSelectAssign, py::arg("graph"), py::arg("params")); + onnx_module.def("_jit_pass_common_subgraph_elimination", CommonSubgraphElimination, py::arg("graph"), py::arg("params")); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp index 10ce9829d5..4834f1d3d5 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.cpp @@ -8,306 +8,399 @@ #include #include -namespace mmdeploy { -namespace torch_jit { - -using torch::jit::AttributeKind; -using torch::jit::ClassType; -using torch::jit::Node; -using torch::jit::Symbol; -using torch::jit::Value; - -namespace prim { -using namespace ::c10::prim; -} - -namespace attr { -using namespace ::c10::attr; -} - -/** - * \brief A class implementing an API for comparing subgraphs. - */ -class SubgraphMatcher::SubgraphMatcherImpl { - public: - explicit SubgraphMatcherImpl(const Graph& pattern, MatchAttribute match_attribute) - : pattern_(pattern), match_attribute_(match_attribute) {} - - /** - * \brief Compare matchGraph with the part of the graph denoted by a node \p - * ANCHOR. - * - * The anchor node would be compared against the deepest node in the - * match-graph. A node is considered matching if its number of inputs/outputs - * is the same as in the corresponding matchGraph node, its type is the same, - * and all nodes producing input-values also match. 
- */ - bool matchesSubgraphFromAnchorNode(Node* anchor); - - /** \brief Return match map for nodes. */ - std::unordered_map nodes_map() const { return nodes_map_; } - - /** \brief Return match map for values. */ - std::unordered_map values_map() const { return values_map_; } - - private: - bool matchValues(const Value* v1, Value* v2); - bool matchNodes(const Node* n1, Node* n2); - bool matchAttributes(const Node* n1, Node* n2); - - static bool isInput(const Value* v); - static bool isOutput(const Value* v); - - std::unordered_map nodes_map_; - std::unordered_map values_map_; - - const MatchAttribute match_attribute_; - const Graph& pattern_; - const Node* anchor_ = nullptr; -}; - -bool SubgraphMatcher::SubgraphMatcherImpl::isInput(const Value* v) { - return v->node()->kind() == prim::Param; -} - -bool SubgraphMatcher::SubgraphMatcherImpl::isOutput(const Value* v) { - for (const Value* output : v->owningGraph()->outputs()) { - if (v == output) { - return true; - } - } - return false; -} - -/** - * Compare two Values. V1 is from pattern, V2 is from the actual graph. - * - * The values are considered matching if: - * 1) the nodes defining them match - * 2) they have the same number of uses, except they are entry or exit nodes. - */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchValues(const Value* v1, Value* v2) { - // Check if we've already visited these values. - if (values_map_.count(v1)) { - if (values_map_.at(v1) != v2) { - GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), - " did not match because %", v1->debugName(), " has already been matched with %", - values_map_.at(v1)->debugName(), ".\n"); - return false; - } - return true; - } - - // When V2 is ANCHOR, we're comparing exiting values, and when V1->node is - // PARAM, we're comparing entering values - in these two cases the number of - // uses don't need to be the same. - if (v1->uses().size() != v2->uses().size() && !isOutput(v1) && !isInput(v1)) { - GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), - " did not match because number of their uses is different.\n"); - return false; - } - - // Add the values to the map before calling matchNodes to avoid infinite - // recursion. 
- GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " matched.\n"); - values_map_[v1] = v2; - return matchNodes(v1->node(), v2->node()); -} - -bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node* n2) { - if (match_attribute_ == FORCE_MATCH && n1->numAttributes() != n2->numAttributes()) { - GRAPH_DEBUG("Nodes did not match in number attributes:\n", *n1, *n2); - return false; - } - for (const Symbol& attr_name : n1->attributeNames()) { - if (n1->kindOf(attr_name) != n2->kindOf(attr_name)) { - GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), - "' did not match:\n", *n1, *n2); - return false; - } - std::vector n1is, n2is; - std::vector n1fs, n2fs; - switch (n1->kindOf(attr_name)) { - case AttributeKind::s: - if (!std::regex_match(n2->s(attr_name), std::regex(n1->s(attr_name)))) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match: ", n1->s(attr_name), " != ", n2->s(attr_name), " \n", *n1, - *n2); - return false; +namespace mmdeploy +{ + namespace torch_jit + { + + using torch::jit::AttributeKind; + using torch::jit::ClassType; + using torch::jit::Node; + using torch::jit::Symbol; + using torch::jit::Value; + + namespace prim + { + using namespace ::c10::prim; } - break; - case AttributeKind::f: - if (n1->f(attr_name) != n2->f(attr_name)) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match:", n1->f(attr_name), " != ", n2->f(attr_name), " \n", *n1, - *n2); - return false; + + namespace attr + { + using namespace ::c10::attr; + } + + /** + * \brief A class implementing an API for comparing subgraphs. + */ + class SubgraphMatcher::SubgraphMatcherImpl + { + public: + explicit SubgraphMatcherImpl(const Graph& pattern, MatchAttribute match_attribute) + : pattern_(pattern) + , match_attribute_(match_attribute) + { + } + + /** + * \brief Compare matchGraph with the part of the graph denoted by a node \p + * ANCHOR. + * + * The anchor node would be compared against the deepest node in the + * match-graph. A node is considered matching if its number of inputs/outputs + * is the same as in the corresponding matchGraph node, its type is the same, + * and all nodes producing input-values also match. + */ + bool matchesSubgraphFromAnchorNode(Node* anchor); + + /** \brief Return match map for nodes. */ + std::unordered_map nodes_map() const + { + return nodes_map_; + } + + /** \brief Return match map for values. */ + std::unordered_map values_map() const + { + return values_map_; + } + + private: + bool matchValues(const Value* v1, Value* v2); + bool matchNodes(const Node* n1, Node* n2); + bool matchAttributes(const Node* n1, Node* n2); + + static bool isInput(const Value* v); + static bool isOutput(const Value* v); + + std::unordered_map nodes_map_; + std::unordered_map values_map_; + + const MatchAttribute match_attribute_; + const Graph& pattern_; + const Node* anchor_ = nullptr; + }; + + bool SubgraphMatcher::SubgraphMatcherImpl::isInput(const Value* v) + { + return v->node()->kind() == prim::Param; + } + + bool SubgraphMatcher::SubgraphMatcherImpl::isOutput(const Value* v) + { + for (const Value* output : v->owningGraph()->outputs()) + { + if (v == output) + { + return true; + } + } + return false; + } + + /** + * Compare two Values. V1 is from pattern, V2 is from the actual graph. 
+ * + * The values are considered matching if: + * 1) the nodes defining them match + * 2) they have the same number of uses, except they are entry or exit nodes. + */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchValues(const Value* v1, Value* v2) + { + // Check if we've already visited these values. + if (values_map_.count(v1)) + { + if (values_map_.at(v1) != v2) + { + GRAPH_DEBUG("Values %", + v1->debugName(), + " and %", + v2->debugName(), + " did not match because %", + v1->debugName(), + " has already been matched with %", + values_map_.at(v1)->debugName(), + ".\n"); + return false; + } + return true; + } + + // When V2 is ANCHOR, we're comparing exiting values, and when V1->node is + // PARAM, we're comparing entering values - in these two cases the number of + // uses don't need to be the same. + if (v1->uses().size() != v2->uses().size() && !isOutput(v1) && !isInput(v1)) + { + GRAPH_DEBUG("Values %", + v1->debugName(), + " and %", + v2->debugName(), + " did not match because number of their uses is different.\n"); + return false; + } + + // Add the values to the map before calling matchNodes to avoid infinite + // recursion. + GRAPH_DEBUG("Values %", v1->debugName(), " and %", v2->debugName(), " matched.\n"); + values_map_[v1] = v2; + return matchNodes(v1->node(), v2->node()); + } + + bool SubgraphMatcher::SubgraphMatcherImpl::matchAttributes(const Node* n1, Node* n2) + { + if (match_attribute_ == FORCE_MATCH && n1->numAttributes() != n2->numAttributes()) + { + GRAPH_DEBUG("Nodes did not match in number attributes:\n", *n1, *n2); + return false; + } + for (const Symbol& attr_name : n1->attributeNames()) + { + if (n1->kindOf(attr_name) != n2->kindOf(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because type of attribute '", + attr_name.toQualString(), + "' did not match:\n", + *n1, + *n2); + return false; + } + std::vector n1is, n2is; + std::vector n1fs, n2fs; + switch (n1->kindOf(attr_name)) + { + case AttributeKind::s: + if (!std::regex_match(n2->s(attr_name), std::regex(n1->s(attr_name)))) + { + GRAPH_DEBUG("Nodes did not match because attribute '", + attr_name.toQualString(), + "' did not match: ", + n1->s(attr_name), + " != ", + n2->s(attr_name), + " \n", + *n1, + *n2); + return false; + } + break; + case AttributeKind::f: + if (n1->f(attr_name) != n2->f(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because attribute '", + attr_name.toQualString(), + "' did not match:", + n1->f(attr_name), + " != ", + n2->f(attr_name), + " \n", + *n1, + *n2); + return false; + } + break; + case AttributeKind::i: + if (n1->i(attr_name) != n2->i(attr_name)) + { + GRAPH_DEBUG("Nodes did not match because attribute '", + attr_name.toQualString(), + "' did not match:", + n1->i(attr_name), + " != ", + n2->i(attr_name), + " \n", + *n1, + *n2); + return false; + } + break; + case AttributeKind::is: + n1is = n1->is(attr_name); + n2is = n2->is(attr_name); + if (n1is.size() != n2is.size()) return false; + for (size_t i = 0; i < n1is.size(); ++i) + { + if (n1is[i] != n2is[i]) return false; + } + break; + case AttributeKind::fs: + n1fs = n1->fs(attr_name); + n2fs = n2->fs(attr_name); + if (n1fs.size() != n2fs.size()) return false; + for (size_t i = 0; i < n1fs.size(); ++i) + { + if (n1fs[i] != n2fs[i]) return false; + } + break; + default: + { + // Other attributes types not supported yet + GRAPH_DEBUG("Nodes did not match because type of attribute '", + attr_name.toQualString(), + "' is not supported.\n", + *n1, + *n2); + return false; + } + } + } + return true; + } + + static bool 
endsWith(const std::string& str, const std::string& suffix) + { + return str.size() >= suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } - break; - case AttributeKind::i: - if (n1->i(attr_name) != n2->i(attr_name)) { - GRAPH_DEBUG("Nodes did not match because attribute '", attr_name.toQualString(), - "' did not match:", n1->i(attr_name), " != ", n2->i(attr_name), " \n", *n1, - *n2); - return false; + + /** + * Compare two Nodes. N1 is from pattern, N2 is from the actual graph. + * + * The nodes are considered matching if: + * 1) N1 and N2 are of the same kind. + * 2) Number of inputs and outputs is the same. + * 3) All input and output values match. + * + * A special case is when N1 is PARAM - this is considered outside the pattern, + * so it matches everything. + */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchNodes(const Node* n1, Node* n2) + { + // Check if we've already visited these nodes. + if (nodes_map_.count(n1)) + { + return nodes_map_.at(n1) == n2; + } + + // Param node in pattern graph matches everything. + if (n1->kind() == prim::Param) + { + GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); + return true; + } + + // We don't allow matches to span across blocks, so check if N2 is in the same + // block as the first (anchor) node. + if (n2->owningBlock() != anchor_->owningBlock()) + { + GRAPH_DEBUG("Nodes did not match because it is in the different block:\n", *n1, *n2); + return false; + } + + // Special handling for matching modules + if (n1->kind() == Symbol::fromQualString("match::module")) + { + if (n2->kind() == prim::GetAttr) + { + if (!n1->hasAttributeS("name")) + { + GRAPH_DEBUG( + "Nodes did not match because special node match::module does not have 'name' " + "attribute:\n", + *n1, + *n2); + return false; + } + auto t = n2->output()->type()->expect(); + auto real_typename = t->name()->qualifiedName(); + auto pattern_typename = n1->s(attr::name); + if (!endsWith(real_typename, pattern_typename)) + { + GRAPH_DEBUG("Nodes did not match because expected module type is different:\n"); + GRAPH_DEBUG(" actualtype: ", real_typename, "\n"); + GRAPH_DEBUG(" expected type: ", pattern_typename, "\n"); + GRAPH_DEBUG("Nodes:", *n1, *n2); + return false; + } + } + } + else + { + if (n1->kind() != n2->kind() || n1->outputs().size() != n2->outputs().size() || + n1->inputs().size() != n2->inputs().size()) + { + GRAPH_DEBUG("Nodes did not match in their kind or number of inputs/outputs:\n", *n1, *n2); + return false; + } + + if (match_attribute_ != NO_MATCH) + { + if (!matchAttributes(n1, n2)) + { + return false; + } + } + } + + // Add nodes to the map before calling matchValues to avoid infinite + // recursion. + nodes_map_[n1] = n2; + for (const auto i : c10::irange(n1->outputs().size())) + { + if (!matchValues(n1->outputs()[i], n2->outputs()[i])) + { + return false; + } + } + for (const auto i : c10::irange(n1->inputs().size())) + { + if (!matchValues(n1->inputs()[i], n2->inputs()[i])) + { + return false; + } + } + + GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); + return true; + } + + /** + * Recursively try to match pattern with the actual graph starting from the + * exiting node in the pattern and anchor node in the actual graph. 
+ */ + bool SubgraphMatcher::SubgraphMatcherImpl::matchesSubgraphFromAnchorNode(Node* anchor) + { + GRAPH_UPDATE("Starting match from a new anchor: ", *anchor); + nodes_map_.clear(); + values_map_.clear(); + anchor_ = anchor; + + const Node* bottom_node = *(pattern_.nodes().end()); + bottom_node = bottom_node->input(0)->node(); + + if (!matchNodes(bottom_node, anchor)) + { + return false; + } + + for (const Value* output : pattern_.outputs()) + { + AT_ASSERT(values_map_.count(output)); + } + + GRAPH_UPDATE("Pattern matched!\n"); + return true; } - break; - case AttributeKind::is: - n1is = n1->is(attr_name); - n2is = n2->is(attr_name); - if (n1is.size() != n2is.size()) return false; - for (size_t i = 0; i < n1is.size(); ++i) { - if (n1is[i] != n2is[i]) return false; + + SubgraphMatcher::SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute) + : impl_(new SubgraphMatcher::SubgraphMatcherImpl(pattern, match_attribute)) + { } - break; - case AttributeKind::fs: - n1fs = n1->fs(attr_name); - n2fs = n2->fs(attr_name); - if (n1fs.size() != n2fs.size()) return false; - for (size_t i = 0; i < n1fs.size(); ++i) { - if (n1fs[i] != n2fs[i]) return false; + + SubgraphMatcher::~SubgraphMatcher() = default; + + bool SubgraphMatcher::matchesSubgraphFromAnchorNode(Node* anchor) + { + return impl_->matchesSubgraphFromAnchorNode(anchor); } - break; - default: { - // Other attributes types not supported yet - GRAPH_DEBUG("Nodes did not match because type of attribute '", attr_name.toQualString(), - "' is not supported.\n", *n1, *n2); - return false; - } - } - } - return true; -} - -static bool endsWith(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -/** - * Compare two Nodes. N1 is from pattern, N2 is from the actual graph. - * - * The nodes are considered matching if: - * 1) N1 and N2 are of the same kind. - * 2) Number of inputs and outputs is the same. - * 3) All input and output values match. - * - * A special case is when N1 is PARAM - this is considered outside the pattern, - * so it matches everything. - */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchNodes(const Node* n1, Node* n2) { - // Check if we've already visited these nodes. - if (nodes_map_.count(n1)) { - return nodes_map_.at(n1) == n2; - } - - // Param node in pattern graph matches everything. - if (n1->kind() == prim::Param) { - GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); - return true; - } - - // We don't allow matches to span across blocks, so check if N2 is in the same - // block as the first (anchor) node. 
- if (n2->owningBlock() != anchor_->owningBlock()) { - GRAPH_DEBUG("Nodes did not match because it is in the different block:\n", *n1, *n2); - return false; - } - - // Special handling for matching modules - if (n1->kind() == Symbol::fromQualString("match::module")) { - if (n2->kind() == prim::GetAttr) { - if (!n1->hasAttributeS("name")) { - GRAPH_DEBUG( - "Nodes did not match because special node match::module does not have 'name' " - "attribute:\n", - *n1, *n2); - return false; - } - auto t = n2->output()->type()->expect(); - auto real_typename = t->name()->qualifiedName(); - auto pattern_typename = n1->s(attr::name); - if (!endsWith(real_typename, pattern_typename)) { - GRAPH_DEBUG("Nodes did not match because expected module type is different:\n"); - GRAPH_DEBUG(" actualtype: ", real_typename, "\n"); - GRAPH_DEBUG(" expected type: ", pattern_typename, "\n"); - GRAPH_DEBUG("Nodes:", *n1, *n2); - return false; - } - } - } else { - if (n1->kind() != n2->kind() || n1->outputs().size() != n2->outputs().size() || - n1->inputs().size() != n2->inputs().size()) { - GRAPH_DEBUG("Nodes did not match in their kind or number of inputs/outputs:\n", *n1, *n2); - return false; - } - - if (match_attribute_ != NO_MATCH) { - if (!matchAttributes(n1, n2)) { - return false; - } - } - } - - // Add nodes to the map before calling matchValues to avoid infinite - // recursion. - nodes_map_[n1] = n2; - for (const auto i : c10::irange(n1->outputs().size())) { - if (!matchValues(n1->outputs()[i], n2->outputs()[i])) { - return false; - } - } - for (const auto i : c10::irange(n1->inputs().size())) { - if (!matchValues(n1->inputs()[i], n2->inputs()[i])) { - return false; - } - } - - GRAPH_DEBUG("Nodes matched:\n", *n1, *n2); - return true; -} - -/** - * Recursively try to match pattern with the actual graph starting from the - * exiting node in the pattern and anchor node in the actual graph. 
- */ -bool SubgraphMatcher::SubgraphMatcherImpl::matchesSubgraphFromAnchorNode(Node* anchor) { - GRAPH_UPDATE("Starting match from a new anchor: ", *anchor); - nodes_map_.clear(); - values_map_.clear(); - anchor_ = anchor; - - const Node* bottom_node = *(pattern_.nodes().end()); - bottom_node = bottom_node->input(0)->node(); - - if (!matchNodes(bottom_node, anchor)) { - return false; - } - - for (const Value* output : pattern_.outputs()) { - AT_ASSERT(values_map_.count(output)); - } - - GRAPH_UPDATE("Pattern matched!\n"); - return true; -} - -SubgraphMatcher::SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute) - : impl_(new SubgraphMatcher::SubgraphMatcherImpl(pattern, match_attribute)) {} - -SubgraphMatcher::~SubgraphMatcher() = default; - -bool SubgraphMatcher::matchesSubgraphFromAnchorNode(Node* anchor) { - return impl_->matchesSubgraphFromAnchorNode(anchor); -} - -std::unordered_map<const Node*, Node*> SubgraphMatcher::nodes_map() const { - return impl_->nodes_map(); -} - -std::unordered_map<const Value*, Value*> SubgraphMatcher::values_map() const { - return impl_->values_map(); -} - -} // namespace torch_jit + + std::unordered_map<const Node*, Node*> SubgraphMatcher::nodes_map() const + { + return impl_->nodes_map(); + } + + std::unordered_map<const Value*, Value*> SubgraphMatcher::values_map() const + { + return impl_->values_map(); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h index e2488e252c..ffe1b51aa8 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/ir/subgraph_matcher.h @@ -5,34 +5,42 @@ #include #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; -using torch::jit::Node; -using torch::jit::Value; - -enum MatchAttribute { FORCE_MATCH, TRY_MATCH, NO_MATCH }; - -class SubgraphMatcher { - public: - explicit SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute = TRY_MATCH); - - ~SubgraphMatcher(); - - bool matchesSubgraphFromAnchorNode(Node* anchor); - - /** \brief Return match map for nodes. */ - std::unordered_map<const Node*, Node*> nodes_map() const; - - /** \brief Return match map for values. */ - std::unordered_map<const Value*, Value*> values_map() const; - - private: - class SubgraphMatcherImpl; - std::unique_ptr<SubgraphMatcherImpl> impl_; -}; - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; + using torch::jit::Node; + using torch::jit::Value; + + enum MatchAttribute + { + FORCE_MATCH, + TRY_MATCH, + NO_MATCH + }; + + class SubgraphMatcher + { + public: + explicit SubgraphMatcher(const Graph& pattern, MatchAttribute match_attribute = TRY_MATCH); + + ~SubgraphMatcher(); + + bool matchesSubgraphFromAnchorNode(Node* anchor); + + /** \brief Return match map for nodes. */ + std::unordered_map<const Node*, Node*> nodes_map() const; + + /** \brief Return match map for values. 
*/ + std::unordered_map<const Value*, Value*> values_map() const; + + private: + class SubgraphMatcherImpl; + std::unique_ptr<SubgraphMatcherImpl> impl_; + }; + + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp index 05ef9d54cd..2178bb3a4e 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.cpp @@ -12,59 +12,63 @@ #include #if TORCH_VERSION_MINOR >= 9 -#include -#include -#include + #include + #include + #include #endif -namespace mmdeploy { +namespace mmdeploy +{ -using torch::jit::Graph; -const std::shared_ptr<Graph>& required_passes(const std::shared_ptr<Graph>& graph) { - RemoveExpands(graph); - CanonicalizeOps(graph); - EliminateDeadCode(graph); - return graph; -} + using torch::jit::Graph; + const std::shared_ptr<Graph>& required_passes(const std::shared_ptr<Graph>& graph) + { + RemoveExpands(graph); + CanonicalizeOps(graph); + EliminateDeadCode(graph); + return graph; + } -Module optimize_for_torchscript(const Module& model) { - auto frozen_model = freeze_module(model); - auto graph = frozen_model.get_method("forward").graph(); - OptimizeFrozenGraph(graph, true); + Module optimize_for_torchscript(const Module& model) + { + auto frozen_model = freeze_module(model); + auto graph = frozen_model.get_method("forward").graph(); + OptimizeFrozenGraph(graph, true); #if TORCH_VERSION_MINOR >= 9 - FuseFrozenConvAddRelu(graph); - ConvertFrozenOpsToMKLDNN(graph); - FrozenLinearTranspose(graph); + FuseFrozenConvAddRelu(graph); + ConvertFrozenOpsToMKLDNN(graph); + FrozenLinearTranspose(graph); #endif - graph = required_passes(graph); - EliminateCommonSubexpression(graph); - PeepholeOptimize(graph); - ConstantPropagation(graph); - ConstantPooling(graph); + graph = required_passes(graph); + EliminateCommonSubexpression(graph); + PeepholeOptimize(graph); + ConstantPropagation(graph); + ConstantPooling(graph); - // TODO: add more custom passes + // TODO: add more custom passes - return frozen_model; -} + return frozen_model; + } -Module optimize_for_onnx(const Module& model) { - auto frozen_model = freeze_module(model, {"training"}); - auto graph = frozen_model.get_method("forward").graph(); - OptimizeFrozenGraph(graph, true); + Module optimize_for_onnx(const Module& model) + { + auto frozen_model = freeze_module(model, {"training"}); + auto graph = frozen_model.get_method("forward").graph(); + OptimizeFrozenGraph(graph, true); #if TORCH_VERSION_MINOR >= 9 - FuseFrozenConvAddRelu(graph); - ConvertFrozenOpsToMKLDNN(graph); - FrozenLinearTranspose(graph); + FuseFrozenConvAddRelu(graph); + ConvertFrozenOpsToMKLDNN(graph); + FrozenLinearTranspose(graph); #endif - // TODO: add more custom passes + // TODO: add more custom passes - return frozen_model; -} + return frozen_model; + } -// TODO: add optimizer for other backend/onnx + // TODO: add optimizer for other backend/onnx } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h index d0d91c627d..fc5a3725d1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/optimizer.h @@ -1,10 +1,11 @@ // Copyright (c) OpenMMLab. All rights reserved.
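optimize_for_torchscript above freezes the module first so the later constant-folding passes see parameters as constants. A hedged caller-side sketch using the public torch::jit C++ API; load_and_optimize is a hypothetical name, not an mmdeploy function:

#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/script.h>

torch::jit::Module load_and_optimize(const std::string& path)
{
    torch::jit::Module model = torch::jit::load(path);
    model.eval();  // freezing expects eval mode so training-only state can be folded
    // freeze_module inlines submodules and turns parameters/attributes into
    // constants, which is what lets ConstantPropagation/ConstantPooling above fire.
    torch::jit::Module frozen = torch::jit::freeze_module(model);
    return frozen;
}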
#include -namespace mmdeploy { -using torch::jit::script::Module; +namespace mmdeploy +{ + using torch::jit::script::Module; -Module optimize_for_torchscript(const Module &model); + Module optimize_for_torchscript(const Module& model); -Module optimize_for_onnx(const Module &model); + Module optimize_for_onnx(const Module& model); } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp index c6541e630a..c26db5a34f 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.cpp @@ -4,135 +4,161 @@ #include #include -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::EqualNode; -using torch::jit::HashNode; -using torch::jit::Node; -using torch::jit::Value; - -struct EqualNodeWithParams { - EqualNodeWithParams(std::unordered_map& params) : params_(params) {} - - bool operator()(const Node* lhs, const Node* rhs) const { - auto lhs_inputs = lhs->inputs(); - auto rhs_inputs = rhs->inputs(); - } - - private: - std::unordered_map& params_; -}; - -struct CommonSubexpressionEliminator { - using ParamMapType = std::unordered_map>; - CommonSubexpressionEliminator(std::shared_ptr graph, - std::unordered_map& params) - : graph_(std::move(graph)), params_(params) {} - - bool run(std::function parent_lookup_fn) { - ParamMapType param_map; - return run(graph_->block(), std::move(parent_lookup_fn), param_map); - } - - // The function implements common subexpression elimination. - // Since the nodes are visited in topological order, one pass is enough. - // returns true if CSE made changes to a graph - bool run(Block* block, std::function parent_lookup_fn, ParamMapType& param_map) { - std::unordered_set subexprs; - bool changed = false; - for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) { - auto node = *it; - - // check if inputs come from params(graph input) - auto node_inputs = node->inputs(); - for (auto input : node_inputs) { - if (input->node()->kind() == Symbol::fromQualString("prim::Param")) { - auto debug_name = input->debugName(); - - // check if input in params_ - if (params_.find(debug_name) == params_.end()) continue; - - // check if input is already visited. - if (param_map.find(debug_name) != param_map.end()) continue; - - // check if there is a param has same value with input - auto val = params_[debug_name]; - bool update_map = true; - for (auto kv : param_map) { - auto param_val = kv.second.first; - if (val.device() != param_val.device()) continue; - if (val.dtype() != param_val.dtype()) continue; - if (!val.equal(param_val)) continue; - input->replaceAllUsesWith(kv.second.second); - update_map = false; - break; - } - - // add input to param_map - if (update_map) { - param_map.emplace(debug_name, - std::make_pair(std::move(val), std::move(input))); - } - } - } - - if (!node->blocks().empty()) { - // Traverse sub-blocks. 
- for (auto block : node->blocks()) { - changed |= run( - block, - [&](Node* n) { - auto existing = subexprs.find(n); - if (existing != subexprs.end()) { - return *existing; +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::EqualNode; + using torch::jit::HashNode; + using torch::jit::Node; + using torch::jit::Value; + + struct EqualNodeWithParams + { + EqualNodeWithParams(std::unordered_map& params) + : params_(params) + { + } + + bool operator()(const Node* lhs, const Node* rhs) const + { + auto lhs_inputs = lhs->inputs(); + auto rhs_inputs = rhs->inputs(); + } + + private: + std::unordered_map& params_; + }; + + struct CommonSubexpressionEliminator + { + using ParamMapType = std::unordered_map>; + CommonSubexpressionEliminator(std::shared_ptr graph, + std::unordered_map& params) + : graph_(std::move(graph)) + , params_(params) + { + } + + bool run(std::function parent_lookup_fn) + { + ParamMapType param_map; + return run(graph_->block(), std::move(parent_lookup_fn), param_map); + } + + // The function implements common subexpression elimination. + // Since the nodes are visited in topological order, one pass is enough. + // returns true if CSE made changes to a graph + bool run(Block* block, std::function parent_lookup_fn, ParamMapType& param_map) + { + std::unordered_set subexprs; + bool changed = false; + for (auto it = block->nodes().begin(); it != block->nodes().end(); ++it) + { + auto node = *it; + + // check if inputs come from params(graph input) + auto node_inputs = node->inputs(); + for (auto input : node_inputs) + { + if (input->node()->kind() == Symbol::fromQualString("prim::Param")) + { + auto debug_name = input->debugName(); + + // check if input in params_ + if (params_.find(debug_name) == params_.end()) continue; + + // check if input is already visited. + if (param_map.find(debug_name) != param_map.end()) continue; + + // check if there is a param has same value with input + auto val = params_[debug_name]; + bool update_map = true; + for (auto kv : param_map) + { + auto param_val = kv.second.first; + if (val.device() != param_val.device()) continue; + if (val.dtype() != param_val.dtype()) continue; + if (!val.equal(param_val)) continue; + input->replaceAllUsesWith(kv.second.second); + update_map = false; + break; + } + + // add input to param_map + if (update_map) + { + param_map.emplace(debug_name, + std::make_pair(std::move(val), std::move(input))); + } + } + } + + if (!node->blocks().empty()) + { + // Traverse sub-blocks. + for (auto block : node->blocks()) + { + changed |= run( + block, + [&](Node* n) + { + auto existing = subexprs.find(n); + if (existing != subexprs.end()) + { + return *existing; + } + + return parent_lookup_fn(n); + }, + param_map); + } + + continue; + } + + // Check for CSE opportunities in the parent block. + auto parent_lookup = parent_lookup_fn(node); + auto g_out = node->owningGraph()->outputs(); + if (parent_lookup != nullptr) + { + changed = true; + node->replaceAllUsesWith(parent_lookup); + it.destroyCurrent(); + continue; + } + + // Check whether the same subexpression already exists. + auto subit = subexprs.insert(node); + if (!subit.second) + { + // Subexpression exists, replace the uses of node, and destroy it. + auto existing = *subit.first; + + changed = true; + node->replaceAllUsesWith(existing); + // Destroy the node. 
+ it.destroyCurrent(); + } } - return parent_lookup_fn(n); - }, - param_map); - } + return changed; + } - continue; - } - - // Check for CSE opportunities in the parent block. - auto parent_lookup = parent_lookup_fn(node); - auto g_out = node->owningGraph()->outputs(); - if (parent_lookup != nullptr) { - changed = true; - node->replaceAllUsesWith(parent_lookup); - it.destroyCurrent(); - continue; - } - - // Check whether the same subexpression already exists. - auto subit = subexprs.insert(node); - if (!subit.second) { - // Subexpression exists, replace the uses of node, and destroy it. - auto existing = *subit.first; - - changed = true; - node->replaceAllUsesWith(existing); - // Destroy the node. - it.destroyCurrent(); - } - } - - return changed; - } - - private: - std::shared_ptr graph_; - std::unordered_map& params_; -}; - -void CommonSubgraphElimination(std::shared_ptr& graph, - std::unordered_map& params) { - CommonSubexpressionEliminator cse(graph, params); - cse.run([](Node*) { return nullptr; }); -} -} // namespace torch_jit + private: + std::shared_ptr graph_; + std::unordered_map& params_; + }; + + void CommonSubgraphElimination(std::shared_ptr& graph, + std::unordered_map& params) + { + CommonSubexpressionEliminator cse(graph, params); + cse.run([](Node*) + { return nullptr; }); + } + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h index d90b98073e..da108ff733 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/common_subgraph_elimination.h @@ -3,18 +3,20 @@ #define _COMMON_SUBGRAPH_ELIMINATION_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::Tensor; -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::Tensor; + using torch::jit::Graph; -// This pass is used eliminate the common subgraph. -// There are two main difference between the one in torch/csrc/jit/pass -// 1. AliasDb is not needed in ONNX model -// 2. params might also participated in the elimination -void CommonSubgraphElimination(std::shared_ptr& graph, - std::unordered_map& params); -} // namespace torch_jit + // This pass is used eliminate the common subgraph. + // There are two main difference between the one in torch/csrc/jit/pass + // 1. AliasDb is not needed in ONNX model + // 2. params might also participated in the elimination + void CommonSubgraphElimination(std::shared_ptr& graph, + std::unordered_map& params); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp index 73f8965412..db44bdb4c1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.cpp @@ -9,89 +9,94 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::IValue; -using torch::jit::Match; -using torch::jit::TensorType; -using torch::jit::TypeKind; -using torch::jit::Value; - -static bool matchClsHead(const Match& match, const std::unordered_map& map) { - // TODO: check if value map in latest pytorch can ease the filter. 
- - // check cat -1 - { - // check if the shape of second inputs is 1 - auto cat_v1 = match.values_map.at(map.at("cat1")); - if (cat_v1->type()->kind() != TypeKind::TensorType) return false; - auto cat_v1_type = cat_v1->type()->cast(); - auto cat_v1_size = cat_v1_type->sizes().concrete_sizes(); - if (!cat_v1_size.has_value()) return false; - IValue cat_v1_size_value(cat_v1_size.value()); - auto size_list = cat_v1_size_value.toIntList(); - if (size_list.size() != 1 || size_list[0] != 1) return false; - } - - // check unsqueeze - auto cat_v0 = match.values_map.at(map.at("cat0")); - auto unsqueeze_node = cat_v0->node(); - { - if (!is_kind(unsqueeze_node, "onnx::Unsqueeze")) return false; - auto unsqueeze_axes = unsqueeze_node->is(Symbol::attr("axes")); - if (unsqueeze_axes.size() != 1 || unsqueeze_axes[0] != 0) return false; - } - - // check gather - auto gather_node = unsqueeze_node->input()->node(); - auto gather_inputs = gather_node->inputs(); - { - if (!is_kind(gather_node, "onnx::Gather")) return false; - auto gather_axis = gather_node->i(Symbol::attr("axis")); - if (gather_axis != 0) return false; - } - - auto x = match.values_map.at(map.at("x")); - // check shape - auto shape_node = gather_inputs[0]->node(); - { - if (!is_kind(shape_node, "onnx::Shape")) return false; - if (shape_node->input() != x) return false; - } - - // check constant - auto const_node = gather_inputs[1]->node(); - { - if (!is_kind(const_node, "onnx::Constant")) return false; - auto ival = const_node->t(Symbol::attr("value")); - if (ival.dim() != 0) return false; - auto ival_dataptr = ival.data_ptr(); - if (ival_dataptr[0] != 0) return false; - } - - // check if reshape is the output of the graph - auto reshape_pattern = map.at("reshape"); - auto reshape_node = match.values_map.at(reshape_pattern); - auto uses = reshape_node->uses(); - for (auto use : uses) { - auto user = use.user; - if (is_kind(user, "prim::Return")) return false; - } - - return true; -} - -// from: -// x->shape->gather->unsqueeze->concat -// | | -// gap--------------------------reshape -// -// to: -// x->gap->flatten -void FlattenClsHead(std::shared_ptr& graph) { - std::string pattern = R"IR( +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::IValue; + using torch::jit::Match; + using torch::jit::TensorType; + using torch::jit::TypeKind; + using torch::jit::Value; + + static bool matchClsHead(const Match& match, const std::unordered_map& map) + { + // TODO: check if value map in latest pytorch can ease the filter. 
+ + // check cat -1 + { + // check if the shape of second inputs is 1 + auto cat_v1 = match.values_map.at(map.at("cat1")); + if (cat_v1->type()->kind() != TypeKind::TensorType) return false; + auto cat_v1_type = cat_v1->type()->cast(); + auto cat_v1_size = cat_v1_type->sizes().concrete_sizes(); + if (!cat_v1_size.has_value()) return false; + IValue cat_v1_size_value(cat_v1_size.value()); + auto size_list = cat_v1_size_value.toIntList(); + if (size_list.size() != 1 || size_list[0] != 1) return false; + } + + // check unsqueeze + auto cat_v0 = match.values_map.at(map.at("cat0")); + auto unsqueeze_node = cat_v0->node(); + { + if (!is_kind(unsqueeze_node, "onnx::Unsqueeze")) return false; + auto unsqueeze_axes = unsqueeze_node->is(Symbol::attr("axes")); + if (unsqueeze_axes.size() != 1 || unsqueeze_axes[0] != 0) return false; + } + + // check gather + auto gather_node = unsqueeze_node->input()->node(); + auto gather_inputs = gather_node->inputs(); + { + if (!is_kind(gather_node, "onnx::Gather")) return false; + auto gather_axis = gather_node->i(Symbol::attr("axis")); + if (gather_axis != 0) return false; + } + + auto x = match.values_map.at(map.at("x")); + // check shape + auto shape_node = gather_inputs[0]->node(); + { + if (!is_kind(shape_node, "onnx::Shape")) return false; + if (shape_node->input() != x) return false; + } + + // check constant + auto const_node = gather_inputs[1]->node(); + { + if (!is_kind(const_node, "onnx::Constant")) return false; + auto ival = const_node->t(Symbol::attr("value")); + if (ival.dim() != 0) return false; + auto ival_dataptr = ival.data_ptr(); + if (ival_dataptr[0] != 0) return false; + } + + // check if reshape is the output of the graph + auto reshape_pattern = map.at("reshape"); + auto reshape_node = match.values_map.at(reshape_pattern); + auto uses = reshape_node->uses(); + for (auto use : uses) + { + auto user = use.user; + if (is_kind(user, "prim::Return")) return false; + } + + return true; + } + + // from: + // x->shape->gather->unsqueeze->concat + // | | + // gap--------------------------reshape + // + // to: + // x->gap->flatten + void FlattenClsHead(std::shared_ptr& graph) + { + std::string pattern = R"IR( graph(%x, %cat0, %cat1): %gap = onnx::GlobalAveragePool(%x) %cat = onnx::Concat[axis=0](%cat0, %cat1) @@ -99,21 +104,22 @@ void FlattenClsHead(std::shared_ptr& graph) { return (%reshape) )IR"; - std::string replacement = R"IR( + std::string replacement = R"IR( graph(%x, %cat0, %cat1): %gap = onnx::GlobalAveragePool(%x) %flatten = onnx::Flatten(%gap) return (%flatten) )IR"; - torch::jit::SubgraphRewriter subgraph_rewriter; - subgraph_rewriter.RegisterRewritePattern(pattern, replacement); - subgraph_rewriter.runOnGraph(graph, matchClsHead); + torch::jit::SubgraphRewriter subgraph_rewriter; + subgraph_rewriter.RegisterRewritePattern(pattern, replacement); + subgraph_rewriter.runOnGraph(graph, matchClsHead); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} + torch::jit::EliminateDeadCode( + graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h index b66b700d1c..64d8ea3352 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h +++ 
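The FlattenClsHead rewrite above is sound because after onnx::GlobalAveragePool the spatial dimensions are 1x1: the Shape/Gather/Unsqueeze/Concat chain merely rebuilds an {N, -1} shape tensor, and Reshape with that shape equals Flatten on axis 1. A small self-contained check of the shape identity (plain C++, not ONNX runtime code):

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

// Flatten(axis) folds all dims before `axis` into one and the rest into
// another, so a {N, C, 1, 1} pooled tensor flattens to the same {N, C}
// that Reshape(x, {N, -1}) produces.
std::vector<long> flattenedShape(const std::vector<long>& dims, int axis = 1)
{
    long outer = std::accumulate(dims.begin(), dims.begin() + axis, 1L, std::multiplies<long>());
    long inner = std::accumulate(dims.begin() + axis, dims.end(), 1L, std::multiplies<long>());
    return {outer, inner};
}

int main()
{
    assert((flattenedShape({8, 512, 1, 1}) == std::vector<long>{8, 512}));
}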
b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/flatten_cls_head.h @@ -3,12 +3,14 @@ #define _FLATTEN_CLS_HEAD_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void FlattenClsHead(std::shared_ptr& graph); -} // namespace torch_jit + void FlattenClsHead(std::shared_ptr& graph); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp index 8dc5847753..bc784671ea 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.cpp @@ -6,131 +6,155 @@ #include "common_subgraph_elimination.h" #include "torch/csrc/jit/ir/irparser.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; - -bool RemoveBoolCast(Node* node) { - auto bottom_node = node->input()->node(); - if (bottom_node->kind() != Symbol::onnx("Greater") && - bottom_node->kind() != Symbol::onnx("Less")) { - return false; - } - node->output()->replaceAllUsesWith(bottom_node->output()); - return true; -} - -bool FuseSelectAssign(Node* node, std::unordered_map& params, - std::unordered_map& vmap, SubgraphMatcher& matcher) { - auto values_map = matcher.values_map(); - - auto cmp1 = values_map[vmap["cmp_1"]]->node(); - auto cmp2 = values_map[vmap["cmp_2"]]->node(); - if (cmp1 != cmp2) { - // cmp_1 == cmp_2, cmp in (Great, Less) - if (cmp1->kind() != cmp2->kind()) return false; - if (!(cmp1->kind() == Symbol::onnx("Greater") || cmp1->kind() == Symbol::onnx("Less"))) - return false; - - // check threshold - Node* cmps[] = {cmp1, cmp2}; - float thres = 0.0f; - Node* x = nullptr; - for (int i = 0; i < 2; ++i) { - auto cmp = cmps[i]; - auto threshold = cmp->inputs()[1]->node(); - if (threshold->kind() != Symbol::onnx("Constant")) return false; - auto thres_val = threshold->t(Symbol::attr("value")); - if (i == 0) { - thres = thres_val.data_ptr()[0]; - x = cmp->inputs()[0]->node(); - } else { - float tmp_val = thres_val.data_ptr()[0]; - if (fabs(thres - tmp_val) > 1e-10) { - return false; +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using torch::jit::Node; + + bool RemoveBoolCast(Node* node) + { + auto bottom_node = node->input()->node(); + if (bottom_node->kind() != Symbol::onnx("Greater") && + bottom_node->kind() != Symbol::onnx("Less")) + { + return false; + } + node->output()->replaceAllUsesWith(bottom_node->output()); + return true; } - if (x != cmp->inputs()[0]->node()) { - return false; + + bool FuseSelectAssign(Node* node, + std::unordered_map& params, + std::unordered_map& vmap, + SubgraphMatcher& matcher) + { + auto values_map = matcher.values_map(); + + auto cmp1 = values_map[vmap["cmp_1"]]->node(); + auto cmp2 = values_map[vmap["cmp_2"]]->node(); + if (cmp1 != cmp2) + { + // cmp_1 == cmp_2, cmp in (Great, Less) + if (cmp1->kind() != cmp2->kind()) return false; + if (!(cmp1->kind() == Symbol::onnx("Greater") || cmp1->kind() == Symbol::onnx("Less"))) + return false; + + // check threshold + Node* cmps[] = {cmp1, cmp2}; + float thres = 0.0f; + Node* x = nullptr; + for (int i = 0; i < 2; ++i) + { + auto cmp = cmps[i]; + auto threshold = 
cmp->inputs()[1]->node(); + if (threshold->kind() != Symbol::onnx("Constant")) return false; + auto thres_val = threshold->t(Symbol::attr("value")); + if (i == 0) + { + thres = thres_val.data_ptr()[0]; + x = cmp->inputs()[0]->node(); + } + else + { + float tmp_val = thres_val.data_ptr()[0]; + if (fabs(thres - tmp_val) > 1e-10) + { + return false; + } + if (x != cmp->inputs()[0]->node()) + { + return false; + } + } + } + } + + { + // check shape of reshape + Node* shape = values_map[vmap["reshape_1_shape"]]->node(); + auto shape_val = shape->t(Symbol::attr("value")); + if (shape_val.dim() != 1) return false; + if (shape_val.data_ptr()[0] != -1) return false; + } + + { + // check transpose + Node* trans[] = {values_map[vmap["trans_1"]]->node(), values_map[vmap["trans_2"]]->node()}; + for (auto tran : trans) + { + auto tran_perm = tran->is(Symbol::attr("perm")); + if (tran_perm.size() != 2) return false; + if (tran_perm[0] != 1 || tran_perm[1] != 0) return false; + } + } + + { + // check gather indice + Node* gather_inds = values_map[vmap["gather_inds_2"]]->node(); + auto inds_val = gather_inds->t(Symbol::attr("value")); + if (inds_val.dim() != 0) return false; + if (inds_val.data_ptr()[0] != 0) return false; + } + + { + // check slice start + Node* slice = values_map[vmap["slice_2"]]->node(); + auto start_name = slice->inputs()[1]->debugName(); + auto start_val = params[start_name]; + if (start_val.dim() != 1) return false; + if (start_val.data_ptr()[0] != 0) return false; + } + + // create new node + auto graph = node->owningGraph(); + auto z = values_map[vmap["z"]]; + auto y = values_map[vmap["y"]]; + auto where_node = graph->create(Symbol::onnx("Where"), {cmp1->output(), z, y}); + where_node->insertBefore(node); + where_node->output()->copyMetadata(node->output()); + node->output()->replaceAllUsesWith(where_node->output()); + return true; + } + + void FuseSelectAssign(Block* block, + std::unordered_map& params, + std::unordered_map& vmap, + SubgraphMatcher& matcher) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it != block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + FuseSelectAssign(block, params, vmap, matcher); + } + + if (node->kind() == Symbol::onnx("Cast") && node->i(Symbol::attr("to")) == 9) + { + RemoveBoolCast(node); + } + else if (matcher.matchesSubgraphFromAnchorNode(node)) + { + FuseSelectAssign(node, params, vmap, matcher); + } + } } - } - } - } - - { - // check shape of reshape - Node* shape = values_map[vmap["reshape_1_shape"]]->node(); - auto shape_val = shape->t(Symbol::attr("value")); - if (shape_val.dim() != 1) return false; - if (shape_val.data_ptr()[0] != -1) return false; - } - - { - // check transpose - Node* trans[] = {values_map[vmap["trans_1"]]->node(), values_map[vmap["trans_2"]]->node()}; - for (auto tran : trans) { - auto tran_perm = tran->is(Symbol::attr("perm")); - if (tran_perm.size() != 2) return false; - if (tran_perm[0] != 1 || tran_perm[1] != 0) return false; - } - } - - { - // check gather indice - Node* gather_inds = values_map[vmap["gather_inds_2"]]->node(); - auto inds_val = gather_inds->t(Symbol::attr("value")); - if (inds_val.dim() != 0) return false; - if (inds_val.data_ptr()[0] != 0) return false; - } - - { - // check slice start - Node* slice = values_map[vmap["slice_2"]]->node(); - auto start_name = slice->inputs()[1]->debugName(); - auto start_val = params[start_name]; - if (start_val.dim() != 1) return false; - if (start_val.data_ptr()[0] != 0) return false; 
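As its header comment says, FuseSelectAssign targets the masked assignment y[x > thres] = z[x > thres], which the exporter lowers to the NonZero/Transpose/scatter subgraph matched above; once the checks pass, the whole pattern collapses into a single onnx::Where(cond, z, y). A host-side sketch of the semantics the fusion relies on (illustrative, elementwise over flat buffers):

#include <cstddef>
#include <vector>

// out[i] = cond[i] ? z[i] : y[i] -- what onnx::Where computes directly,
// and what the scatter-based pattern assembles element by element.
std::vector<float> where(const std::vector<bool>&  cond,
                         const std::vector<float>& z,
                         const std::vector<float>& y)
{
    std::vector<float> out(y.size());
    for (std::size_t i = 0; i < y.size(); ++i)
    {
        out[i] = cond[i] ? z[i] : y[i];
    }
    return out;
}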
- } - - // create new node - auto graph = node->owningGraph(); - auto z = values_map[vmap["z"]]; - auto y = values_map[vmap["y"]]; - auto where_node = graph->create(Symbol::onnx("Where"), {cmp1->output(), z, y}); - where_node->insertBefore(node); - where_node->output()->copyMetadata(node->output()); - node->output()->replaceAllUsesWith(where_node->output()); - return true; -} - -void FuseSelectAssign(Block* block, std::unordered_map& params, - std::unordered_map& vmap, SubgraphMatcher& matcher) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - FuseSelectAssign(block, params, vmap, matcher); - } - - if (node->kind() == Symbol::onnx("Cast") && node->i(Symbol::attr("to")) == 9) { - RemoveBoolCast(node); - } else if (matcher.matchesSubgraphFromAnchorNode(node)) { - FuseSelectAssign(node, params, vmap, matcher); - } - } -} - -void FuseSelectAssign(std::shared_ptr& graph, - std::unordered_map& params) { - // cse before search - CommonSubgraphElimination(graph, params); - - std::string pattern_str = R"IR( + + void FuseSelectAssign(std::shared_ptr& graph, + std::unordered_map& params) + { + // cse before search + CommonSubgraphElimination(graph, params); + + std::string pattern_str = R"IR( graph(%y, %z, %cmp_1, %cmp_2, %start, %axes, %shape_2): %nz_1 = onnx::NonZero(%cmp_1) %trans_1 = onnx::Transpose(%nz_1) @@ -149,15 +173,16 @@ void FuseSelectAssign(std::shared_ptr& graph, return (%scatter_2) )IR"; - Graph pattern; - std::unordered_map vmap; - torch::jit::parseIR(pattern_str, &pattern, vmap); - - SubgraphMatcher matcher(pattern, MatchAttribute::NO_MATCH); - FuseSelectAssign(graph->block(), params, vmap, matcher); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} -} // namespace torch_jit + Graph pattern; + std::unordered_map vmap; + torch::jit::parseIR(pattern_str, &pattern, vmap); + + SubgraphMatcher matcher(pattern, MatchAttribute::NO_MATCH); + FuseSelectAssign(graph->block(), params, vmap, matcher); + torch::jit::EliminateDeadCode( + graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h index afa0dc56d6..0e80ec1d67 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/fuse_select_assign.h @@ -3,15 +3,17 @@ #define _FUSE_SELECT_ASSIGN_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::Tensor; -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::Tensor; + using torch::jit::Graph; -// this pass is used to fuse y[x>thres] = z[x>thres] -void FuseSelectAssign(std::shared_ptr& graph, - std::unordered_map& params); -} // namespace torch_jit + // this pass is used to fuse y[x>thres] = z[x>thres] + void FuseSelectAssign(std::shared_ptr& graph, + std::unordered_map& params); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp index 3da4933b15..dea6909f8b 100644 --- 
a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.cpp @@ -5,111 +5,131 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; -using torch::jit::TensorType; -using torch::jit::Value; - -void MergeShapeConcate(Node* node) { - auto inputs = node->inputs(); - - std::vector gather_value; - Value* shape_from = nullptr; - - std::vector node_to_remove{node}; - - // check pattern shape->gather->unsqueeze->concate - for (auto input : inputs) { - auto unsqueeze_node = input->node(); - if (!is_kind(unsqueeze_node, "onnx::Unsqueeze") || unsqueeze_node->output()->uses().size() != 1) - return; - - if (unsqueeze_node->hasAttribute(Symbol::attr("axes"))) { - auto axes = unsqueeze_node->is(Symbol::attr("axes")); - if (axes.size() != 1 && axes[0] != 0) return; - } - - auto gather_node = unsqueeze_node->input(0)->node(); - if (!is_kind(gather_node, "onnx::Gather") || gather_node->i(Symbol::attr("axis")) != 0 || - gather_node->output()->uses().size() != 1) - return; - - auto gather_inputs = gather_node->inputs(); - auto gather_data = gather_inputs[0]; - auto gather_indices = gather_inputs[1]; - auto shape_node = gather_data->node(); - if (!is_kind(shape_node, "onnx::Shape") || shape_node->output()->uses().size() != 1) return; - - auto current_shape_from = shape_node->input(); - if (!shape_from) { - shape_from = current_shape_from; - } else { - if (shape_from != current_shape_from) return; - } - - auto constant_node = gather_indices->node(); - if (!is_kind(constant_node, "onnx::Constant")) return; - - auto gather_indices_val = constant_node->t(Symbol::attr("value")); - int64_t* data_ptr = gather_indices_val.data_ptr(); - if (gather_indices_val.dim() == 0) { - gather_value.push_back(data_ptr[0]); - } else { - int element_size = gather_indices_val.element_size(); - for (int j = 0; j < element_size; ++j) { - gather_value.push_back(data_ptr[j]); - } - } - - node_to_remove.insert(node_to_remove.end(), {unsqueeze_node, gather_node, shape_node}); - } - - // create constant value - auto graph = node->owningGraph(); - auto const_node = graph->create(Symbol::onnx("Constant")); - const_node->t_(Symbol::attr("value"), at::tensor(gather_value)); - auto first_node = node->owningGraph()->block()->nodes().front(); - if (const_node != first_node) const_node->insertBefore(first_node); - - // recreate shape node - auto shape_node = graph->create(Symbol::onnx("Shape"), {shape_from}); - shape_node->insertBefore(node); - - // create gather node - auto gather_node = - graph->create(Symbol::onnx("Gather"), {shape_node->output(), const_node->output()}); - - // insert into graph - gather_node->insertAfter(node); - node->output()->replaceAllUsesWith(gather_node->output()); - - for (auto n : node_to_remove) { - n->destroy(); - } -} - -void MergeShapeConcate(Block* block) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - MergeShapeConcate(block); - } - - if (is_kind(node, "onnx::Concat")) { - MergeShapeConcate(node); - } - } -} - -void MergeShapeConcate(const std::shared_ptr& graph) { MergeShapeConcate(graph->block()); } - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using 
torch::jit::Node; + using torch::jit::TensorType; + using torch::jit::Value; + + void MergeShapeConcate(Node* node) + { + auto inputs = node->inputs(); + + std::vector gather_value; + Value* shape_from = nullptr; + + std::vector node_to_remove{node}; + + // check pattern shape->gather->unsqueeze->concate + for (auto input : inputs) + { + auto unsqueeze_node = input->node(); + if (!is_kind(unsqueeze_node, "onnx::Unsqueeze") || unsqueeze_node->output()->uses().size() != 1) + return; + + if (unsqueeze_node->hasAttribute(Symbol::attr("axes"))) + { + auto axes = unsqueeze_node->is(Symbol::attr("axes")); + if (axes.size() != 1 && axes[0] != 0) return; + } + + auto gather_node = unsqueeze_node->input(0)->node(); + if (!is_kind(gather_node, "onnx::Gather") || gather_node->i(Symbol::attr("axis")) != 0 || + gather_node->output()->uses().size() != 1) + return; + + auto gather_inputs = gather_node->inputs(); + auto gather_data = gather_inputs[0]; + auto gather_indices = gather_inputs[1]; + auto shape_node = gather_data->node(); + if (!is_kind(shape_node, "onnx::Shape") || shape_node->output()->uses().size() != 1) return; + + auto current_shape_from = shape_node->input(); + if (!shape_from) + { + shape_from = current_shape_from; + } + else + { + if (shape_from != current_shape_from) return; + } + + auto constant_node = gather_indices->node(); + if (!is_kind(constant_node, "onnx::Constant")) return; + + auto gather_indices_val = constant_node->t(Symbol::attr("value")); + int64_t* data_ptr = gather_indices_val.data_ptr(); + if (gather_indices_val.dim() == 0) + { + gather_value.push_back(data_ptr[0]); + } + else + { + int element_size = gather_indices_val.element_size(); + for (int j = 0; j < element_size; ++j) + { + gather_value.push_back(data_ptr[j]); + } + } + + node_to_remove.insert(node_to_remove.end(), {unsqueeze_node, gather_node, shape_node}); + } + + // create constant value + auto graph = node->owningGraph(); + auto const_node = graph->create(Symbol::onnx("Constant")); + const_node->t_(Symbol::attr("value"), at::tensor(gather_value)); + auto first_node = node->owningGraph()->block()->nodes().front(); + if (const_node != first_node) const_node->insertBefore(first_node); + + // recreate shape node + auto shape_node = graph->create(Symbol::onnx("Shape"), {shape_from}); + shape_node->insertBefore(node); + + // create gather node + auto gather_node = + graph->create(Symbol::onnx("Gather"), {shape_node->output(), const_node->output()}); + + // insert into graph + gather_node->insertAfter(node); + node->output()->replaceAllUsesWith(gather_node->output()); + + for (auto n : node_to_remove) + { + n->destroy(); + } + } + + void MergeShapeConcate(Block* block) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it != block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + MergeShapeConcate(block); + } + + if (is_kind(node, "onnx::Concat")) + { + MergeShapeConcate(node); + } + } + } + + void MergeShapeConcate(const std::shared_ptr& graph) + { + MergeShapeConcate(graph->block()); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h index 8656da63c2..13a67f0f47 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/merge_shape_concate.h @@ -3,12 +3,14 @@ 
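MergeShapeConcate above replaces a fan of per-dimension Shape -> Gather(axis=0) -> Unsqueeze(axes=[0]) chains feeding one Concat with a single Gather over the same shape using all collected indices. A simplified host-side equivalent of the merged lookup:

#include <cstdint>
#include <vector>

// Gather(Shape(x), indices) with every index at once: e.g. {0, 2, 3}
// picks N, H, W out of an NCHW shape in one node instead of three
// gather/unsqueeze pairs plus a concat.
std::vector<int64_t> gatherShape(const std::vector<int64_t>& shape,
                                 const std::vector<int64_t>& indices)
{
    std::vector<int64_t> out;
    out.reserve(indices.size());
    for (auto i : indices)
    {
        out.push_back(shape[i]);
    }
    return out;
}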
#define _MERGE_SHAPE_CONCATE_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void MergeShapeConcate(const std::shared_ptr& graph); -} // namespace torch_jit + void MergeShapeConcate(const std::shared_ptr& graph); + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp index f0ef5a5230..7c2f866b85 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.cpp @@ -7,75 +7,90 @@ #include "utils.h" -namespace mmdeploy { -namespace torch_jit { - -using c10::Symbol; -using torch::jit::Block; -using torch::jit::IValue; -using torch::jit::Node; -using torch::jit::TensorType; -using torch::jit::Value; - -void RemoveReshapeChain(Node* node) { - // reshape->reshape => reshape - auto output = node->output(); - if (!(output->hasUses())) { - return; - } - auto uses = output->uses(); - - for (auto use : uses) { - if (!is_kind(use.user, "onnx::Reshape") || use.offset != 0) { - return; - } - } - - auto input = node->inputs()[0]; - output->replaceAllUsesWith(input); - - node->destroy(); -} - -void RemoveRedundantCast(Node* node) { - // Cast(type n)->Cast(type n) => Cast(type n) - - auto to_type = node->i(Symbol::attr("to")); - auto input = node->input(); - - auto input_node = input->node(); - if (is_kind(input_node, "onnx::Cast") && input_node->i(Symbol::attr("to")) == to_type) { - auto output = node->output(); - - output->replaceAllUsesWith(input); - node->destroy(); - } -} - -void ONNXPeephole(Block* block) { - auto graph = block->owningGraph(); - auto it = block->nodes().begin(); - while (it != block->nodes().end()) { - auto node = *it; - ++it; - for (auto block : node->blocks()) { - ONNXPeephole(block); - } - - if (is_kind(node, "onnx::Reshape")) { - RemoveReshapeChain(node); - } else if (is_kind(node, "onnx::Cast")) { - RemoveRedundantCast(node); - } - } -} - -void ONNXPeephole(const std::shared_ptr& graph) { - ONNXPeephole(graph->block()); - torch::jit::EliminateDeadCode( - graph->block(), true, - torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); -} - -} // namespace torch_jit +namespace mmdeploy +{ + namespace torch_jit + { + + using c10::Symbol; + using torch::jit::Block; + using torch::jit::IValue; + using torch::jit::Node; + using torch::jit::TensorType; + using torch::jit::Value; + + void RemoveReshapeChain(Node* node) + { + // reshape->reshape => reshape + auto output = node->output(); + if (!(output->hasUses())) + { + return; + } + auto uses = output->uses(); + + for (auto use : uses) + { + if (!is_kind(use.user, "onnx::Reshape") || use.offset != 0) + { + return; + } + } + + auto input = node->inputs()[0]; + output->replaceAllUsesWith(input); + + node->destroy(); + } + + void RemoveRedundantCast(Node* node) + { + // Cast(type n)->Cast(type n) => Cast(type n) + + auto to_type = node->i(Symbol::attr("to")); + auto input = node->input(); + + auto input_node = input->node(); + if (is_kind(input_node, "onnx::Cast") && input_node->i(Symbol::attr("to")) == to_type) + { + auto output = node->output(); + + output->replaceAllUsesWith(input); + node->destroy(); + } + } + + void ONNXPeephole(Block* block) + { + auto graph = block->owningGraph(); + auto it = block->nodes().begin(); + while (it 
!= block->nodes().end()) + { + auto node = *it; + ++it; + for (auto block : node->blocks()) + { + ONNXPeephole(block); + } + + if (is_kind(node, "onnx::Reshape")) + { + RemoveReshapeChain(node); + } + else if (is_kind(node, "onnx::Cast")) + { + RemoveRedundantCast(node); + } + } + } + + void ONNXPeephole(const std::shared_ptr& graph) + { + ONNXPeephole(graph->block()); + torch::jit::EliminateDeadCode(graph->block(), + true, + torch::jit::DCESideEffectPolicy::ALLOW_DELETING_NODES_WITH_SIDE_EFFECTS); + } + + } // namespace torch_jit } // namespace mmdeploy diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h index f388da1bfa..21b7be15d1 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/onnx_peephole.h @@ -3,13 +3,15 @@ #define _ONNX_PEEPHOLE_H_ #include -namespace mmdeploy { -namespace torch_jit { -using torch::jit::Graph; +namespace mmdeploy +{ + namespace torch_jit + { + using torch::jit::Graph; -void ONNXPeephole(const std::shared_ptr& graph); + void ONNXPeephole(const std::shared_ptr& graph); -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h index 1c92cd15a1..147e5b1349 100644 --- a/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h +++ b/csrc/mmdeploy/backend_ops/torchscript/optimizer/passes/onnx/utils.h @@ -3,18 +3,24 @@ #include -namespace mmdeploy { -namespace torch_jit { -using c10::Symbol; -using torch::jit::Node; +namespace mmdeploy +{ + namespace torch_jit + { + using c10::Symbol; + using torch::jit::Node; -inline bool is_kind(const Node* node, const Symbol& symbol) { return node->kind() == symbol; } + inline bool is_kind(const Node* node, const Symbol& symbol) + { + return node->kind() == symbol; + } -inline bool is_kind(const Node* node, const char* symbol_name) { - return is_kind(node, Symbol::fromQualString(symbol_name)); -} + inline bool is_kind(const Node* node, const char* symbol_name) + { + return is_kind(node, Symbol::fromQualString(symbol_name)); + } -} // namespace torch_jit + } // namespace torch_jit } // namespace mmdeploy #endif diff --git a/csrc/mmdeploy/codebase/CMakeLists.txt b/csrc/mmdeploy/codebase/CMakeLists.txt index f933b7fb92..172274efcb 100644 --- a/csrc/mmdeploy/codebase/CMakeLists.txt +++ b/csrc/mmdeploy/codebase/CMakeLists.txt @@ -3,29 +3,29 @@ project(mmdeploy_codebase) set(CODEBASES "") -if ("all" IN_LIST MMDEPLOY_CODEBASES) - list(APPEND CODEBASES "mmcls") - list(APPEND CODEBASES "mmdet") - list(APPEND CODEBASES "mmseg") - list(APPEND CODEBASES "mmocr") - list(APPEND CODEBASES "mmedit") - list(APPEND CODEBASES "mmpose") - list(APPEND CODEBASES "mmrotate") - list(APPEND CODEBASES "mmaction") -else () - set(CODEBASES ${MMDEPLOY_CODEBASES}) -endif () +if("all" IN_LIST MMDEPLOY_CODEBASES) + list(APPEND CODEBASES "mmcls") + list(APPEND CODEBASES "mmdet") + list(APPEND CODEBASES "mmseg") + list(APPEND CODEBASES "mmocr") + list(APPEND CODEBASES "mmedit") + list(APPEND CODEBASES "mmpose") + list(APPEND CODEBASES "mmrotate") + list(APPEND CODEBASES "mmaction") +else() + set(CODEBASES ${MMDEPLOY_CODEBASES}) +endif() -foreach (codebase IN LISTS CODEBASES) - message(STATUS "build codebase: ${codebase}") - if (codebase STREQUAL "mmpretrain") - 
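The two peephole rules above are local identities: a Reshape whose every consumer is itself a Reshape can forward its input, since only the final target shape matters, and a Cast feeding a Cast to the same dtype collapses to one. A minimal model of the redundant-cast check, with illustrative types standing in for torch::jit nodes:

// Host-side analogue of RemoveRedundantCast.
enum class DType
{
    kFloat,
    kInt64,
    kBool
};

struct CastNode
{
    DType           to;
    const CastNode* input_cast;  // nullptr if the producer is not a Cast
};

// True when the producer already casts to the same type, so this node's
// output can be replaced by its input and the node destroyed.
bool isRedundantCast(const CastNode& node)
{
    return node.input_cast != nullptr && node.input_cast->to == node.to;
}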
set(subdir_name "mmcls") - elseif (codebase STREQUAL "mmyolo") - set(subdir_name "mmdet") - elseif (codebase STREQUAL "mmagic") - set(subdir_name "mmedit") - else() - set(subdir_name ${codebase}) - endif() - add_subdirectory(${subdir_name}) -endforeach () +foreach(codebase IN LISTS CODEBASES) + message(STATUS "build codebase: ${codebase}") + if(codebase STREQUAL "mmpretrain") + set(subdir_name "mmcls") + elseif(codebase STREQUAL "mmyolo") + set(subdir_name "mmdet") + elseif(codebase STREQUAL "mmagic") + set(subdir_name "mmedit") + else() + set(subdir_name ${codebase}) + endif() + add_subdirectory(${subdir_name}) +endforeach() diff --git a/csrc/mmdeploy/codebase/common.h b/csrc/mmdeploy/codebase/common.h index 391f177590..a0d1bc4a18 100644 --- a/csrc/mmdeploy/codebase/common.h +++ b/csrc/mmdeploy/codebase/common.h @@ -9,69 +9,92 @@ #include "mmdeploy/core/utils/formatter.h" #include "mmdeploy/experimental/module_adapter.h" -namespace mmdeploy { +namespace mmdeploy +{ -using namespace framework; + using namespace framework; -class Context { - public: - explicit Context(const Value& config) { - MMDEPLOY_DEBUG("config: {}", config); - device_ = config["context"]["device"].get(); - stream_ = config["context"]["stream"].get(); - } + class Context + { + public: + explicit Context(const Value& config) + { + MMDEPLOY_DEBUG("config: {}", config); + device_ = config["context"]["device"].get(); + stream_ = config["context"]["stream"].get(); + } - Device& device() { return device_; } - Stream& stream() { return stream_; } + Device& device() + { + return device_; + } - protected: - Device device_; - Stream stream_; -}; + Stream& stream() + { + return stream_; + } -template -class CodebaseCreator : public Creator { - public: - std::string_view name() const noexcept override { return Tag::name; } - std::unique_ptr Create(const Value& cfg) override { - constexpr auto key{"component"}; - if (!cfg.contains(key)) { - MMDEPLOY_ERROR("no key '{}' in config {}", key, cfg); - throw_exception(eInvalidArgument); - } - if (!cfg[key].is_string()) { - MMDEPLOY_ERROR("key '{}' is not a string", key); - throw_exception(eInvalidArgument); - } - auto postprocess_type = cfg[key].get(); - auto creator = gRegistry().Get(postprocess_type); - if (creator == nullptr) { - MMDEPLOY_ERROR("Could not found entry '{}' in {}. 
Available components: {}", postprocess_type, - Tag::name, gRegistry().List()); - throw_exception(eEntryNotFound); - } - return creator->Create(cfg); - } -}; + protected: + Device device_; + Stream stream_; + }; -#define MMDEPLOY_DECLARE_CODEBASE(codebase_type, codebase_name) \ - class codebase_type : public Context { \ - public: \ - static constexpr const auto name = #codebase_name; \ - using type = std::unique_ptr; \ - explicit codebase_type(const Value& config) : Context(config) {} \ - }; \ - MMDEPLOY_DECLARE_REGISTRY(codebase_type, std::unique_ptr(const Value& config)); + template + class CodebaseCreator : public Creator + { + public: + std::string_view name() const noexcept override + { + return Tag::name; + } -#define MMDEPLOY_REGISTER_CODEBASE(codebase) \ - using codebase##_##Creator = CodebaseCreator; \ - MMDEPLOY_REGISTER_CREATOR(Module, codebase##_##Creator) \ - MMDEPLOY_DEFINE_REGISTRY(codebase) + std::unique_ptr Create(const Value& cfg) override + { + constexpr auto key{"component"}; + if (!cfg.contains(key)) + { + MMDEPLOY_ERROR("no key '{}' in config {}", key, cfg); + throw_exception(eInvalidArgument); + } + if (!cfg[key].is_string()) + { + MMDEPLOY_ERROR("key '{}' is not a string", key); + throw_exception(eInvalidArgument); + } + auto postprocess_type = cfg[key].get(); + auto creator = gRegistry().Get(postprocess_type); + if (creator == nullptr) + { + MMDEPLOY_ERROR("Could not found entry '{}' in {}. Available components: {}", + postprocess_type, + Tag::name, + gRegistry().List()); + throw_exception(eEntryNotFound); + } + return creator->Create(cfg); + } + }; -#define MMDEPLOY_REGISTER_CODEBASE_COMPONENT(codebase, component_type) \ - MMDEPLOY_REGISTER_FACTORY_FUNC(codebase, (component_type, 0), [](const Value& config) { \ - return CreateTask(component_type(config)); \ - }) +#define MMDEPLOY_DECLARE_CODEBASE(codebase_type, codebase_name) \ + class codebase_type : public Context \ + { \ + public: \ + static constexpr const auto name = #codebase_name; \ + using type = std::unique_ptr; \ + explicit codebase_type(const Value& config) \ + : Context(config) \ + { \ + } \ + }; \ + MMDEPLOY_DECLARE_REGISTRY(codebase_type, std::unique_ptr(const Value& config)); + +#define MMDEPLOY_REGISTER_CODEBASE(codebase) \ + using codebase##_##Creator = CodebaseCreator; \ + MMDEPLOY_REGISTER_CREATOR(Module, codebase##_##Creator) \ + MMDEPLOY_DEFINE_REGISTRY(codebase) + +#define MMDEPLOY_REGISTER_CODEBASE_COMPONENT(codebase, component_type) \ + MMDEPLOY_REGISTER_FACTORY_FUNC(codebase, (component_type, 0), [](const Value& config) { return CreateTask(component_type(config)); }) } // namespace mmdeploy diff --git a/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt b/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt index 2ea41f7271..380b7b6f46 100644 --- a/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt +++ b/csrc/mmdeploy/codebase/mmaction/CMakeLists.txt @@ -5,11 +5,12 @@ project(mmdeploy_mmaction) file(GLOB SRCS ${CMAKE_CURRENT_SOURCE_DIR} "*.cpp") mmdeploy_add_module(${PROJECT_NAME} "${SRCS}") -target_link_libraries(${PROJECT_NAME} PRIVATE - mmdeploy_operation - mmdeploy_transform - mmdeploy_opencv_utils) +target_link_libraries( + ${PROJECT_NAME} PRIVATE mmdeploy_operation mmdeploy_transform + mmdeploy_opencv_utils) add_library(mmdeploy::mmaction ALIAS ${PROJECT_NAME}) -set(MMDEPLOY_TASKS ${MMDEPLOY_TASKS} video_recognizer CACHE INTERNAL "") +set(MMDEPLOY_TASKS + ${MMDEPLOY_TASKS} video_recognizer + CACHE INTERNAL "") diff --git a/csrc/mmdeploy/codebase/mmaction/base_head.cpp 
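CodebaseCreator::Create above is a registry lookup keyed by the "component" string in the config: a missing or non-string key is an invalid argument, and an unregistered name reports eEntryNotFound alongside the available entries. A minimal sketch of that factory-registry shape (illustrative names, not the real mmdeploy registry API):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Module
{
};

using Factory = std::function<std::unique_ptr<Module>()>;

std::map<std::string, Factory>& registry()
{
    static std::map<std::string, Factory> r;
    return r;
}

// Returns nullptr for an unknown component, where the real code logs the
// available entries and throws.
std::unique_ptr<Module> create(const std::string& component)
{
    auto it = registry().find(component);
    return it == registry().end() ? nullptr : it->second();
}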
b/csrc/mmdeploy/codebase/mmaction/base_head.cpp index 931c9663eb..2e541fd660 100644 --- a/csrc/mmdeploy/codebase/mmaction/base_head.cpp +++ b/csrc/mmdeploy/codebase/mmaction/base_head.cpp @@ -7,66 +7,75 @@ #include "mmdeploy/core/tensor.h" #include "mmdeploy/core/utils/device_utils.h" -namespace mmdeploy::mmaction { +namespace mmdeploy::mmaction +{ -class BaseHead : public MMAction { - public: - explicit BaseHead(const Value& cfg) : MMAction(cfg) { - if (cfg.contains("params")) { - topk_ = cfg["params"].value("topk", 1); - if (topk_ <= 0) { - MMDEPLOY_ERROR("'topk' should be greater than 0, but got '{}'", topk_); - throw_exception(eInvalidArgument); - } - } - } + class BaseHead : public MMAction + { + public: + explicit BaseHead(const Value& cfg) + : MMAction(cfg) + { + if (cfg.contains("params")) + { + topk_ = cfg["params"].value("topk", 1); + if (topk_ <= 0) + { + MMDEPLOY_ERROR("'topk' should be greater than 0, but got '{}'", topk_); + throw_exception(eInvalidArgument); + } + } + } - Result operator()(const Value& infer_res) { - MMDEPLOY_DEBUG("infer_res: {}", infer_res); - auto output = infer_res["output"].get(); + Result operator()(const Value& infer_res) + { + MMDEPLOY_DEBUG("infer_res: {}", infer_res); + auto output = infer_res["output"].get(); - if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) { - MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(), - (int)output.data_type()); - return Status(eNotSupported); - } + if (!(output.shape().size() >= 2 && output.data_type() == DataType::kFLOAT)) + { + MMDEPLOY_ERROR("unsupported `output` tensor, shape: {}, dtype: {}", output.shape(), (int)output.data_type()); + return Status(eNotSupported); + } - auto class_num = (int)output.shape(1); + auto class_num = (int)output.shape(1); - OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream())); - OUTCOME_TRY(stream().Wait()); + OUTCOME_TRY(auto _scores, MakeAvailableOnDevice(output, kHost, stream())); + OUTCOME_TRY(stream().Wait()); - return GetLabels(_scores, class_num); - } + return GetLabels(_scores, class_num); + } - private: - Value GetLabels(const Tensor& scores, int class_num) const { - auto scores_data = scores.data(); - Labels output; - output.reserve(topk_); - std::vector idx(class_num); - iota(begin(idx), end(idx), 0); - partial_sort(begin(idx), begin(idx) + topk_, end(idx), - [&](int i, int j) { return scores_data[i] > scores_data[j]; }); - for (int i = 0; i < topk_; ++i) { - auto label = Label{idx[i], scores_data[idx[i]]}; - MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score); - output.push_back(label); - } - return to_value(std::move(output)); - } + private: + Value GetLabels(const Tensor& scores, int class_num) const + { + auto scores_data = scores.data(); + Labels output; + output.reserve(topk_); + std::vector idx(class_num); + iota(begin(idx), end(idx), 0); + partial_sort(begin(idx), begin(idx) + topk_, end(idx), [&](int i, int j) + { return scores_data[i] > scores_data[j]; }); + for (int i = 0; i < topk_; ++i) + { + auto label = Label{idx[i], scores_data[idx[i]]}; + MMDEPLOY_DEBUG("label_id: {}, score: {}", label.label_id, label.score); + output.push_back(label); + } + return to_value(std::move(output)); + } - private: - static constexpr const auto kHost = Device{0}; - int topk_{1}; -}; + private: + static constexpr const auto kHost = Device{0}; + int topk_{1}; + }; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, BaseHead); + MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, BaseHead); 
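GetLabels above selects the top-k classes with std::partial_sort over an index array, ordering only the first k slots (roughly O(n log k)) instead of sorting every class score. The same selection as a standalone function:

#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>

// Sort only the first k index slots by descending score, then pair each
// selected index with its score -- the scheme BaseHead::GetLabels uses.
std::vector<std::pair<int, float>> topk(const std::vector<float>& scores, int k)
{
    k = std::min<int>(k, static_cast<int>(scores.size()));
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(), [&](int i, int j)
                      { return scores[i] > scores[j]; });
    std::vector<std::pair<int, float>> out;
    out.reserve(k);
    for (int i = 0; i < k; ++i)
    {
        out.emplace_back(idx[i], scores[idx[i]]);
    }
    return out;
}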
-using SlowFastHead = BaseHead; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, SlowFastHead); + using SlowFastHead = BaseHead; + MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, SlowFastHead); -using TSNHead = BaseHead; -MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, TSNHead); + using TSNHead = BaseHead; + MMDEPLOY_REGISTER_CODEBASE_COMPONENT(MMAction, TSNHead); } // namespace mmdeploy::mmaction diff --git a/csrc/mmdeploy/codebase/mmaction/format_shape.cpp b/csrc/mmdeploy/codebase/mmaction/format_shape.cpp index 7d8c6ac5c6..ff65fe184d 100644 --- a/csrc/mmdeploy/codebase/mmaction/format_shape.cpp +++ b/csrc/mmdeploy/codebase/mmaction/format_shape.cpp @@ -7,122 +7,141 @@ using namespace std; -namespace mmdeploy::mmaction { - -FormatShape::FormatShape(const Value& args) { - input_format_ = args.value("input_format", std::string("")); - if (input_format_ != "NCHW" && input_format_ != "NCTHW") { - MMDEPLOY_ERROR("'input_format' should be 'NCHW' or 'NCTHW'"); - throw_exception(eInvalidArgument); - } - permute_ = ::mmdeploy::operation::Managed<::mmdeploy::operation::Permute>::Create(); -} - -Result FormatShape::MergeInputs(const std::vector& images, Tensor& inputs) { - auto N = static_cast(images.size()); - auto H = images[0].shape(1); - auto W = images[0].shape(2); - auto C = images[0].shape(3); - auto& device = operation::gContext().device(); - auto& stream = operation::gContext().stream(); - - TensorDesc desc = {device, DataType::kFLOAT, {N, H, W, C}}; - inputs = Tensor(desc); - auto offset = 0UL; - auto n_item = H * W * C; - auto copy_size = n_item * sizeof(float); - for (int i = 0; i < N; i++) { - auto src_buffer = images[i].buffer(); - auto dst_buffer = inputs.buffer(); - OUTCOME_TRY(stream.Copy(src_buffer, dst_buffer, copy_size, 0, offset)); - offset += copy_size; - } - return success(); -} - -Result FormatShape::Format(const std::vector& images, Tensor& output, int clip_len, - int num_clips) { - Tensor inputs; - OUTCOME_TRY(MergeInputs(images, inputs)); - - // Tensor dst; - if (input_format_ == "NCHW") { - OUTCOME_TRY(FormatNCHW(inputs, clip_len, num_clips, output)); - } - if (input_format_ == "NCTHW") { - OUTCOME_TRY(FormatNCTHW(inputs, clip_len, num_clips, output)); - } - - TensorShape expand_dim = output.shape(); - expand_dim.insert(expand_dim.begin(), 1); - output.Reshape(expand_dim); - - return success(); -} - -Result FormatShape::FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) { - const vector axes = {0, 3, 1, 2}; - OUTCOME_TRY(permute_.Apply(src, dst, axes)); - return success(); -} - -Result FormatShape::FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) { - auto N = src.shape(0); - auto H = src.shape(1); - auto W = src.shape(2); - auto C = src.shape(3); - int L = clip_len; - if (N % L != 0) { - return Status(eInvalidArgument); - } - int M = N / L; - src.Reshape({M, L, H, W, C}); - const vector axes = {0, 4, 1, 2, 3}; - OUTCOME_TRY(permute_.Apply(src, dst, axes)); - return success(); -} - -Result FormatShape::Apply(Value& data) { - MMDEPLOY_DEBUG("input: {}", data); - - if (!data.is_array()) { - MMDEPLOY_ERROR("input of format shape should be array"); - return Status(eInvalidArgument); - } - if (!(data[0].contains("imgs") || data[0].contains("img"))) { - MMDEPLOY_ERROR("input should contains imgs or img"); - return Status(eInvalidArgument); - } - - int n_image = data.size(); - int clip_len = data[0]["clip_len"].get(); - int num_clips = data[0]["num_clips"].get(); - std::vector images; - - if (data[0].contains("imgs")) { - int n_crop = 
data[0]["imgs"].size(); - int total = n_image * n_crop; - images.reserve(total); - for (int i = 0; i < n_crop; i++) { - for (int j = 0; j < n_image; j++) { - images.push_back(data[j]["imgs"][i].get()); - } +namespace mmdeploy::mmaction +{ + + FormatShape::FormatShape(const Value& args) + { + input_format_ = args.value("input_format", std::string("")); + if (input_format_ != "NCHW" && input_format_ != "NCTHW") + { + MMDEPLOY_ERROR("'input_format' should be 'NCHW' or 'NCTHW'"); + throw_exception(eInvalidArgument); + } + permute_ = ::mmdeploy::operation::Managed<::mmdeploy::operation::Permute>::Create(); } - } else if (data[0].contains("img")) { - images.reserve(n_image); - for (int i = 0; i < n_image; i++) { - images.push_back(data[i]["img"].get()); + + Result FormatShape::MergeInputs(const std::vector& images, Tensor& inputs) + { + auto N = static_cast(images.size()); + auto H = images[0].shape(1); + auto W = images[0].shape(2); + auto C = images[0].shape(3); + auto& device = operation::gContext().device(); + auto& stream = operation::gContext().stream(); + + TensorDesc desc = {device, DataType::kFLOAT, {N, H, W, C}}; + inputs = Tensor(desc); + auto offset = 0UL; + auto n_item = H * W * C; + auto copy_size = n_item * sizeof(float); + for (int i = 0; i < N; i++) + { + auto src_buffer = images[i].buffer(); + auto dst_buffer = inputs.buffer(); + OUTCOME_TRY(stream.Copy(src_buffer, dst_buffer, copy_size, 0, offset)); + offset += copy_size; + } + return success(); + } + + Result FormatShape::Format(const std::vector& images, Tensor& output, int clip_len, int num_clips) + { + Tensor inputs; + OUTCOME_TRY(MergeInputs(images, inputs)); + + // Tensor dst; + if (input_format_ == "NCHW") + { + OUTCOME_TRY(FormatNCHW(inputs, clip_len, num_clips, output)); + } + if (input_format_ == "NCTHW") + { + OUTCOME_TRY(FormatNCTHW(inputs, clip_len, num_clips, output)); + } + + TensorShape expand_dim = output.shape(); + expand_dim.insert(expand_dim.begin(), 1); + output.Reshape(expand_dim); + + return success(); } - } - Tensor dst; - data = Value{}; - OUTCOME_TRY(Format(images, dst, clip_len, num_clips)); - data["img"] = std::move(dst); + Result FormatShape::FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) + { + const vector axes = {0, 3, 1, 2}; + OUTCOME_TRY(permute_.Apply(src, dst, axes)); + return success(); + } - return success(); -} + Result FormatShape::FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst) + { + auto N = src.shape(0); + auto H = src.shape(1); + auto W = src.shape(2); + auto C = src.shape(3); + int L = clip_len; + if (N % L != 0) + { + return Status(eInvalidArgument); + } + int M = N / L; + src.Reshape({M, L, H, W, C}); + const vector axes = {0, 4, 1, 2, 3}; + OUTCOME_TRY(permute_.Apply(src, dst, axes)); + return success(); + } + + Result FormatShape::Apply(Value& data) + { + MMDEPLOY_DEBUG("input: {}", data); + + if (!data.is_array()) + { + MMDEPLOY_ERROR("input of format shape should be array"); + return Status(eInvalidArgument); + } + if (!(data[0].contains("imgs") || data[0].contains("img"))) + { + MMDEPLOY_ERROR("input should contains imgs or img"); + return Status(eInvalidArgument); + } + + int n_image = data.size(); + int clip_len = data[0]["clip_len"].get(); + int num_clips = data[0]["num_clips"].get(); + std::vector images; + + if (data[0].contains("imgs")) + { + int n_crop = data[0]["imgs"].size(); + int total = n_image * n_crop; + images.reserve(total); + for (int i = 0; i < n_crop; i++) + { + for (int j = 0; j < n_image; j++) + { + 
images.push_back(data[j]["imgs"][i].get()); + } + } + } + else if (data[0].contains("img")) + { + images.reserve(n_image); + for (int i = 0; i < n_image; i++) + { + images.push_back(data[i]["img"].get()); + } + } + + Tensor dst; + data = Value{}; + OUTCOME_TRY(Format(images, dst, clip_len, num_clips)); + data["img"] = std::move(dst); + + return success(); + } -MMDEPLOY_REGISTER_TRANSFORM(FormatShape); + MMDEPLOY_REGISTER_TRANSFORM(FormatShape); } // namespace mmdeploy::mmaction diff --git a/csrc/mmdeploy/codebase/mmaction/format_shape.h b/csrc/mmdeploy/codebase/mmaction/format_shape.h index 97e4f99356..7ea0326c84 100644 --- a/csrc/mmdeploy/codebase/mmaction/format_shape.h +++ b/csrc/mmdeploy/codebase/mmaction/format_shape.h @@ -12,27 +12,28 @@ #include "mmdeploy/operation/vision.h" #include "mmdeploy/preprocess/transform/transform.h" -namespace mmdeploy::mmaction { +namespace mmdeploy::mmaction +{ -class FormatShape : public Transform { - public: - explicit FormatShape(const Value& args); + class FormatShape : public Transform + { + public: + explicit FormatShape(const Value& args); - Result Apply(Value& data) override; + Result Apply(Value& data) override; - Result Format(const std::vector& images, Tensor& output, int clip_len, - int num_clips); + Result Format(const std::vector& images, Tensor& output, int clip_len, int num_clips); - Result FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst); + Result FormatNCHW(Tensor& src, int clip_len, int num_clips, Tensor& dst); - Result FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst); + Result FormatNCTHW(Tensor& src, int clip_len, int num_clips, Tensor& dst); - Result MergeInputs(const std::vector& images, Tensor& inputs); + Result MergeInputs(const std::vector& images, Tensor& inputs); - private: - std::string input_format_; - operation::Managed permute_; -}; + private: + std::string input_format_; + operation::Managed permute_; + }; } // namespace mmdeploy::mmaction diff --git a/csrc/mmdeploy/codebase/mmaction/mmaction.cpp b/csrc/mmdeploy/codebase/mmaction/mmaction.cpp index dc590a1800..7de226ecd1 100644 --- a/csrc/mmdeploy/codebase/mmaction/mmaction.cpp +++ b/csrc/mmdeploy/codebase/mmaction/mmaction.cpp @@ -2,8 +2,9 @@ #include "mmdeploy/codebase/mmaction/mmaction.h" -namespace mmdeploy::mmaction { +namespace mmdeploy::mmaction +{ -MMDEPLOY_REGISTER_CODEBASE(MMAction); + MMDEPLOY_REGISTER_CODEBASE(MMAction); } // namespace mmdeploy::mmaction diff --git a/csrc/mmdeploy/codebase/mmaction/mmaction.h b/csrc/mmdeploy/codebase/mmaction/mmaction.h index ef097e6f20..a3add86894 100644 --- a/csrc/mmdeploy/codebase/mmaction/mmaction.h +++ b/csrc/mmdeploy/codebase/mmaction/mmaction.h @@ -8,17 +8,19 @@ #include "mmdeploy/core/module.h" #include "mmdeploy/core/serialization.h" -namespace mmdeploy::mmaction { +namespace mmdeploy::mmaction +{ -struct Label { - int label_id; - float score; - MMDEPLOY_ARCHIVE_MEMBERS(label_id, score); -}; + struct Label + { + int label_id; + float score; + MMDEPLOY_ARCHIVE_MEMBERS(label_id, score); + }; -using Labels = std::vector
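FormatNCTHW above regroups N merged frames into M = N / clip_len clips by reshaping to {M, L, H, W, C} and then permuting with axes {0, 4, 1, 2, 3} to move channels forward. The resulting shape bookkeeping as a small sketch:

#include <array>
#include <cstdint>

// {N, H, W, C} frames -> {M, C, L, H, W} clips with M = N / L; the pass
// returns eInvalidArgument when N is not a multiple of the clip length.
std::array<int64_t, 5> ncthwShape(int64_t N, int64_t H, int64_t W, int64_t C, int64_t L)
{
    int64_t M = N / L;
    return {M, C, L, H, W};
}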