-
Notifications
You must be signed in to change notification settings - Fork 376
[Feature] C++ Extension: Introduce DocNode, TextSplitterBase, SentenseSplitter
#1022
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 39 commits
e011555
bfb16fa
54e43c1
894bb73
b0a1ca0
8e7e017
fa1d7d2
7ea108e
f0f6657
1854448
1484fc7
a69f82f
a6cfceb
0170a0e
459cfd4
5ea167c
e4070f8
6017ffa
cc7ab7e
615b7b0
0b193c8
0d88ea6
02cbec4
af7e617
1c7ee82
9ef9bd8
068ca98
19e00dd
e0c3acc
06aa586
a214e35
e865ab6
2fd8583
ac9dad3
4ab5a93
b38affc
79218fb
ee3ecbc
42252a7
06eabd4
fa73e50
08f3333
2c893df
f850d15
9e709da
1024d0e
81c9aaa
25d0c83
7680fff
980d0ad
0dec57a
b5c4ba3
1e1087a
13a167d
043fd1b
5ccc9a7
a9417f3
c1d03cd
2b5393e
3fbe4b0
787cddd
ed4c5d5
8ae6609
f25a1d4
9e2cb42
6e32f84
369eada
326c068
5a9d1b7
1b74fa4
4600b72
a105fc1
62f2793
d82edc8
20b3eb5
7608a63
47a2680
754388e
7e895b7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,48 @@ | ||
| cmake_minimum_required(VERSION 3.16) | ||
| project(LazyLLMCPP LANGUAGES CXX) | ||
|
|
||
| set(CMAKE_CXX_STANDARD 11) | ||
| set(CMAKE_CXX_STANDARD 17) | ||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | ||
| set(CMAKE_POSITION_INDEPENDENT_CODE ON) | ||
|
|
||
| find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) | ||
| find_package(pybind11 CONFIG REQUIRED) | ||
| # Third party libs | ||
| include(cmake/third_party.cmake) | ||
|
|
||
| # Config lazyllm_core lib with pure cpp code. | ||
| file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") | ||
| file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/core/src/*.cpp") | ||
| add_library(lazyllm_core STATIC ${LAZYLLM_CORE_SOURCES}) | ||
| target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) | ||
| target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/core/include) | ||
| target_link_libraries(lazyllm_core PUBLIC xxhash) | ||
| target_link_libraries(lazyllm_core PUBLIC tiktoken) | ||
| target_link_libraries(lazyllm_core PUBLIC utf8proc) | ||
| target_compile_options(lazyllm_core PRIVATE -Werror -Wshadow) | ||
|
|
||
| # Config lazyllm_adaptor lib which maintains callback invocations. | ||
| file(GLOB_RECURSE LAZYLLM_ADAPTOR_SOURCES CONFIGURE_DEPENDS | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/adaptor/*.cpp") | ||
| add_library(lazyllm_adaptor STATIC ${LAZYLLM_ADAPTOR_SOURCES}) | ||
| target_include_directories(lazyllm_adaptor PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/adaptor) | ||
| target_link_libraries(lazyllm_adaptor PUBLIC pybind11::headers Python3::Python lazyllm_core) | ||
| target_compile_options(lazyllm_adaptor PRIVATE -Werror -Wshadow) | ||
|
|
||
| # Config lazyllm_cpp lib with binding information. | ||
| set(LAZYLLM_BINDING_SOURCES binding/lazyllm.cpp binding/doc.cpp) | ||
| file(GLOB_RECURSE LAZYLLM_BINDING_SOURCES CONFIGURE_DEPENDS | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/binding/*.cpp") | ||
| set(INTERFACE_TARGET_NAME lazyllm_cpp) | ||
| pybind11_add_module(${INTERFACE_TARGET_NAME} ${LAZYLLM_BINDING_SOURCES}) | ||
| target_link_libraries(${INTERFACE_TARGET_NAME} PRIVATE lazyllm_core) | ||
| target_link_libraries(${INTERFACE_TARGET_NAME} PRIVATE lazyllm_core lazyllm_adaptor) | ||
| target_compile_options(${INTERFACE_TARGET_NAME} PRIVATE -Werror -Wshadow) | ||
|
|
||
| # Ensure lazyllm_cpp can find third-party shared libraries under lazyllm/cpp_lib. | ||
| set(_lazyllm_cpp_rpath "$ORIGIN/cpp_lib") | ||
| if (APPLE) | ||
| set(_lazyllm_cpp_rpath "@loader_path/cpp_lib") | ||
| endif() | ||
| set_target_properties(${INTERFACE_TARGET_NAME} PROPERTIES | ||
| BUILD_RPATH "${_lazyllm_cpp_rpath}" | ||
| INSTALL_RPATH "${_lazyllm_cpp_rpath}" | ||
| ) | ||
|
|
||
| if (CMAKE_BUILD_TYPE STREQUAL "Debug") | ||
| # SHOW_SYMBOL | ||
|
|
@@ -26,7 +51,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") | |
| endif() | ||
|
|
||
| # Install | ||
| install(TARGETS ${INTERFACE_TARGET_NAME} LIBRARY DESTINATION lazyllm) | ||
| install(TARGETS ${INTERFACE_TARGET_NAME} | ||
| LIBRARY DESTINATION lazyllm COMPONENT lazyllm_cpp | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an AI-generated suggestion; please verify before applying. [medium] [logic] Install rules for shared libraries omit the ARCHIVE destination, so Windows import libraries (.lib) won't be installed correctly. Suggestion: Add an `ARCHIVE DESTINATION` clause next to the existing `LIBRARY` and `RUNTIME` clauses. auto reviewed by BOT (claude-opus-4-6) |
||
| RUNTIME DESTINATION lazyllm COMPONENT lazyllm_cpp | ||
| ) | ||
| install(TARGETS tiktoken utf8proc | ||
| LIBRARY DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp | ||
| RUNTIME DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp | ||
| ) | ||
|
|
||
|
|
||
| # TESTS | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| #include "adaptor_base_wrapper.hpp" | ||
| #include "document_store.hpp" | ||
CompromisedKiwi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <mutex> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include <pybind11/pybind11.h> | ||
|
|
||
| #include "adaptor_base.hpp" | ||
|
|
||
|
|
||
| namespace lazyllm { | ||
|
|
||
| class LAZYLLM_HIDDEN AdaptorBaseWrapper : public AdaptorBase { | ||
| pybind11::object _py_obj; | ||
| public: | ||
| AdaptorBaseWrapper(const pybind11::object &obj) : _py_obj(obj) {} | ||
| virtual ~AdaptorBaseWrapper() = default; | ||
|
|
||
| std::any call( | ||
| const std::string& func_name, | ||
| const std::unordered_map<std::string, std::any>& args) const override final | ||
| { | ||
| pybind11::gil_scoped_acquire gil; | ||
| pybind11::object func = pybind11::getattr(_py_obj, func_name.c_str(), pybind11::none()); | ||
| return call_impl(func_name, func, args); | ||
| } | ||
|
|
||
| virtual std::any call_impl( | ||
| const std::string& func_name, | ||
| const pybind11::object& func, | ||
| const std::unordered_map<std::string, std::any>& args) const = 0; | ||
| }; | ||
|
|
||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,119 @@ | ||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include <pybind11/pybind11.h> | ||
| #include <pybind11/stl.h> | ||
|
|
||
| #include "adaptor_base_wrapper.hpp" | ||
| #include "doc_node.hpp" | ||
|
|
||
| namespace lazyllm { | ||
|
|
||
/// Description of one node group: its parent group, a display name and a kind.
struct NodeGroup {
    /// Kind of content the group holds.
    enum class Type {
        ORIGINAL,
        CHUNK,
        SUMMARY,
        IMAGE_INFO,
        QUESTION_ANSWER,
        OTHER
    };

    std::string _parent;        // name of the parent group
    std::string _display_name;  // display name of this group
    Type _type;                 // kind of this group

    /// Builds a group description; `type` defaults to Type::ORIGINAL.
    NodeGroup(const std::string& parent,
              const std::string& display_name,
              const Type& type = Type::ORIGINAL)
        : _parent{parent},
          _display_name{display_name},
          _type{type}
    {}
};
|
|
||
| class LAZYLLM_HIDDEN DocumentStore : public AdaptorBaseWrapper { | ||
| public: | ||
| DocumentStore() = delete; | ||
| explicit DocumentStore( | ||
| const pybind11::object& store, | ||
| const std::unordered_map<std::string, NodeGroup> &map) : | ||
| AdaptorBaseWrapper(store), _node_groups_map(map) {} | ||
|
|
||
| // Cache-aware factory to avoid rebuilding adaptor for the same Python store. | ||
| static std::shared_ptr<DocumentStore> from_store( | ||
| const pybind11::object& store, const std::unordered_map<std::string, NodeGroup>& map) { | ||
| if (store.is_none()) return nullptr; | ||
|
|
||
| pybind11::gil_scoped_acquire gil; | ||
| PyObject *key = store.ptr(); | ||
| auto &cache = store_cache(); | ||
| auto it = cache.find(key); | ||
| if (it != cache.end()) { | ||
| if (auto existing = it->second.lock()) | ||
| return existing; | ||
| } | ||
| auto created = std::make_shared<DocumentStore>(store, map); | ||
| cache[key] = created; | ||
| return created; | ||
| } | ||
|
|
||
| DocNode::Children get_node_children(const DocNode* node) const { | ||
| DocNode::Children out; | ||
| auto& kb_id = std::any_cast<std::string&>(node->_p_global_metadata->at(std::string(RAGMetadataKeys::KB_ID))); | ||
| auto& doc_id = std::any_cast<std::string&>(node->_p_global_metadata->at(std::string(RAGMetadataKeys::DOC_ID))); | ||
| auto& group_name = node->_group_name; | ||
| for(auto& [current_group_name, group] : _node_groups_map) { | ||
| if (group._parent != group_name) continue; | ||
| if (!std::any_cast<bool>(call("is_group_active", {{"group", current_group_name}}))) continue; | ||
| auto nodes_in_group = std::any_cast<std::vector<PDocNode>>(call("get_nodes", { | ||
| {"group_name", current_group_name}, | ||
| {"kb_id", kb_id}, | ||
| {"doc_ids", std::vector<std::string>({doc_id})} | ||
| })); | ||
|
|
||
| std::vector<PDocNode> children; | ||
| children.reserve(nodes_in_group.size()); | ||
| for (auto n : nodes_in_group) | ||
| if (n->get_parent_node() == node) children.push_back(n); | ||
| out[current_group_name] = children; | ||
| } | ||
| return out; | ||
| } | ||
|
|
||
| private: | ||
| std::unordered_map<std::string, NodeGroup> _node_groups_map; | ||
|
|
||
| std::any call_impl( | ||
| const std::string& func_name, | ||
| const pybind11::object& func, | ||
| const std::unordered_map<std::string, std::any>& args) const override | ||
| { | ||
| if (func_name == "is_group_active") { | ||
| return func(args.at("group")).cast<bool>(); | ||
| } | ||
| else if (func_name == "get_node") { | ||
| return func( | ||
| pybind11::arg("group_name") = std::any_cast<std::string>(args.at("group_name")), | ||
| pybind11::arg("uids") = std::vector<std::string>({std::any_cast<std::string>(args.at("uid"))}), | ||
| pybind11::arg("kb_id") = std::any_cast<std::string>(args.at("kb_id")), | ||
| pybind11::arg("display") = true | ||
| ).cast<pybind11::list>()[0].cast<DocNode*>(); | ||
| } | ||
CompromisedKiwi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| else if (func_name == "get_nodes") { | ||
| return func( | ||
| pybind11::arg("group_name") = std::any_cast<std::string>(args.at("group_name")), | ||
| pybind11::arg("kb_id") = std::any_cast<std::string>(args.at("kb_id")), | ||
| pybind11::arg("doc_ids") = std::vector<std::string>({std::any_cast<std::string>(args.at("doc_id"))}) | ||
CompromisedKiwi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ).cast<std::vector<DocNode*>>(); | ||
| } | ||
| else if (func_name == "get_node_children") { | ||
| return get_node_children(std::any_cast<DocNode*>(args.at("node"))); | ||
| } | ||
|
|
||
| throw std::runtime_error("Unknown DocumentStore function: " + func_name); | ||
| } | ||
|
|
||
| // Cache by Python object identity to ensure one wrapper per store instance. | ||
| static std::unordered_map<PyObject *, std::weak_ptr<DocumentStore>> &store_cache() { | ||
| static std::unordered_map<PyObject *, std::weak_ptr<DocumentStore>> cache; | ||
| return cache; | ||
| } | ||
CompromisedKiwi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| }; | ||
|
|
||
| } // namespace lazyllm | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an AI-generated suggestion; please verify before applying.
[critical] [logic]
`if: always()` 使得 `cpp_ext_test` 作业在 `needs: [clone]` 失败时仍会运行,可能导致在没有正确 clone 的情况下执行后续步骤并产生不可预测的失败。Suggestion: 如果意图是即使其他作业失败也运行,但前提是 clone 成功,应改为:`if: always() && needs.clone.result == 'success'`
auto reviewed by BOT (claude-opus-4-6)