-
Notifications
You must be signed in to change notification settings - Fork 376
[Feature] C++ Extension: Introduce DocNode, TextSplitterBase, SentenceSplitter
#1022
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 67 commits
e011555
bfb16fa
54e43c1
894bb73
b0a1ca0
8e7e017
fa1d7d2
7ea108e
f0f6657
1854448
1484fc7
a69f82f
a6cfceb
0170a0e
459cfd4
5ea167c
e4070f8
6017ffa
cc7ab7e
615b7b0
0b193c8
0d88ea6
02cbec4
af7e617
1c7ee82
9ef9bd8
068ca98
19e00dd
e0c3acc
06aa586
a214e35
e865ab6
2fd8583
ac9dad3
4ab5a93
b38affc
79218fb
ee3ecbc
42252a7
06eabd4
fa73e50
08f3333
2c893df
f850d15
9e709da
1024d0e
81c9aaa
25d0c83
7680fff
980d0ad
0dec57a
b5c4ba3
1e1087a
13a167d
043fd1b
5ccc9a7
a9417f3
c1d03cd
2b5393e
3fbe4b0
787cddd
ed4c5d5
8ae6609
f25a1d4
9e2cb42
6e32f84
369eada
326c068
5a9d1b7
1b74fa4
4600b72
a105fc1
62f2793
d82edc8
20b3eb5
7608a63
47a2680
754388e
7e895b7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -748,30 +748,103 @@ jobs: | |
| export LAZYLLM_K8S_CONFIG_PATH="/mnt/nfs_share/k8s_config.yaml" | ||
| export LAZYLLM_HOME="${{ env.K8S_CI_PATH }}/${{ github.run_id }}-${{ github.job }}" | ||
| mkdir -p $LAZYLLM_HOME | ||
| source /mnt/nfs_share/env.sh | ||
| source /mnt/nfs_share/env.sh | ||
| pytest --lf --last-failed-no-failures=all --durations=0 --reruns=2 -v tests/k8s_tests | ||
|
|
||
| cpp_ext_test: | ||
| name: C++ Extension Test (${{ matrix.os }}) | ||
| name: C++ Build + Python Regression (${{ matrix.os }}) | ||
| needs: [ clone ] | ||
| if: always() | ||
| runs-on: ${{ matrix.os }} | ||
| strategy: | ||
| fail-fast: false | ||
| matrix: | ||
| os: [ubuntu-latest, macos-latest, windows-latest] | ||
| timeout-minutes: 120 | ||
| defaults: | ||
| run: | ||
| shell: bash | ||
| env: | ||
| PYTHONNOUSERSITE: "1" | ||
| PYTHONPATH: ${{ github.workspace }} | ||
| LAZYLLM_ENABLE_CPP_OVERRIDE: "1" | ||
| LAZYLLM_EXPECTED_LOG_MODULES: "all" | ||
| LAZYLLM_DEFAULT_LAUNCHER: "empty" | ||
| LAZYLLM_OPENAI_API_KEY: ${{ secrets.LAZYLLM_OPENAI_API_KEY }} | ||
| LAZYLLM_KIMI_API_KEY: ${{ secrets.LAZYLLM_KIMI_API_KEY }} | ||
| LAZYLLM_AIPING_API_KEY: ${{ secrets.LAZYLLM_AIPING_API_KEY }} | ||
| LAZYLLM_GLM_API_KEY: ${{ secrets.LAZYLLM_GLM_API_KEY }} | ||
| LAZYLLM_GLM_MODEL_NAME: ${{ secrets.LAZYLLM_GLM_MODEL_NAME }} | ||
| LAZYLLM_QWEN_API_KEY: ${{ secrets.LAZYLLM_QWEN_API_KEY }} | ||
| LAZYLLM_QWEN_MODEL_NAME: ${{ secrets.LAZYLLM_QWEN_MODEL_NAME }} | ||
| LAZYLLM_QWEN_TEXT2IMAGE_MODEL_NAME: ${{ secrets.LAZYLLM_QWEN_TEXT2IMAGE_MODEL_NAME }} | ||
| LAZYLLM_SENSENOVA_API_KEY: ${{ secrets.LAZYLLM_SENSENOVA_API_KEY }} | ||
| LAZYLLM_SENSENOVA_SECRET_KEY: ${{ secrets.LAZYLLM_SENSENOVA_SECRET_KEY }} | ||
| LAZYLLM_DOUBAO_API_KEY: ${{ secrets.LAZYLLM_DOUBAO_API_KEY }} | ||
| LAZYLLM_DOUBAO_MODEL_NAME: ${{ secrets.LAZYLLM_DOUBAO_MODEL_NAME }} | ||
| LAZYLLM_SILICONFLOW_API_KEY: ${{ secrets.LAZYLLM_SILICONFLOW_API_KEY }} | ||
| LAZYLLM_SILICONFLOW_MODEL_NAME: ${{ secrets.LAZYLLM_SILICONFLOW_MODEL_NAME }} | ||
| LAZYLLM_MINIMAX_API_KEY: ${{ secrets.LAZYLLM_MINIMAX_API_KEY }} | ||
| LAZYLLM_MINIMAX_MODEL_NAME: ${{ secrets.LAZYLLM_MINIMAX_MODEL_NAME }} | ||
| LAZYLLM_PPOP_API_KEY: ${{ secrets.LAZYLLM_PPOP_API_KEY }} | ||
|
|
||
| steps: | ||
| - name: Checkout | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| submodules: false | ||
| submodules: false | ||
|
|
||
| - name: Set up python ${{ env.PYTHON_VERSION }} | ||
| uses: actions/setup-python@v5 | ||
| with: | ||
| python-version: ${{ env.PYTHON_VERSION }} | ||
| - name: Setup | ||
| uses: ./.github/actions/setup | ||
|
|
||
| - name: Install test requirements | ||
| run: | | ||
| pip install -r tests/requirements.txt | ||
| if [[ "${{ runner.os }}" == "Linux" ]]; then | ||
| pip install -r tests/requirements_linux.txt | ||
| elif [[ "${{ runner.os }}" == "macOS" ]]; then | ||
| pip install -r tests/requirements_mac.txt | ||
| fi | ||
|
|
||
| - name: Download test dataset | ||
| run: | | ||
| set -euo pipefail | ||
| DATA_DIR="$GITHUB_WORKSPACE/.ci_data" | ||
| rm -rf "$DATA_DIR" | ||
| export GIT_TERMINAL_PROMPT=0 | ||
| git clone --depth 1 https://${GITHUB_TOKEN}@github.com/LazyAGI/LazyLLM-Data.git "$DATA_DIR" | ||
| echo "LAZYLLM_DATA_PATH=$DATA_DIR" >> "$GITHUB_ENV" | ||
| env: | ||
| GITHUB_TOKEN: ${{ secrets.PERSONAL_GITHUB_TOKEN || github.token }} | ||
|
|
||
| - name: Test | ||
| - name: Build and run C++ tests | ||
| run: | | ||
| pip install pybind11 | ||
| bash csrc/scripts/build_test.sh | ||
|
|
||
| - name: Install C++ extension artifacts into workspace | ||
| run: | | ||
| cmake --install build --prefix . --component lazyllm_cpp | ||
| ls -al lazyllm | rg "lazyllm_cpp|cpp_lib" || true | ||
|
|
||
| - name: Run basic tests | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an AI-generated suggestion; please verify before applying. [medium] [maintainability] basic_tests 和 advanced_tests 的 marker 计算逻辑完全重复,违反 DRY 原则,后续修改容易遗漏。 Suggestion: 将 marker 计算提取到一个独立的前置步骤中,并通过环境变量(例如写入 `$GITHUB_ENV`)传递给后续步骤。 auto reviewed by BOT (claude-opus-4-6) |
||
| run: | | ||
| if [[ "${{ runner.os }}" == "Linux" ]]; then | ||
| MARKER="not skip_on_linux" | ||
| elif [[ "${{ runner.os }}" == "macOS" ]]; then | ||
| MARKER="not skip_on_mac" | ||
| else | ||
| MARKER="not skip_on_win" | ||
| fi | ||
| pytest -v --order-scope=class -m "$MARKER" tests/basic_tests | ||
|
|
||
| - name: Run advanced tests | ||
| run: | | ||
| if [[ "${{ runner.os }}" == "Linux" ]]; then | ||
| MARKER="not skip_on_linux" | ||
| elif [[ "${{ runner.os }}" == "macOS" ]]; then | ||
| MARKER="not skip_on_mac" | ||
| else | ||
| MARKER="not skip_on_win" | ||
| fi | ||
| pytest -v --order-scope=class -m "$MARKER" tests/advanced_tests | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,81 @@ | ||
| cmake_minimum_required(VERSION 3.16) | ||
| project(LazyLLMCPP LANGUAGES CXX) | ||
|
|
||
| set(CMAKE_CXX_STANDARD 11) | ||
| set(CMAKE_CXX_STANDARD 17) | ||
| set(CMAKE_CXX_STANDARD_REQUIRED ON) | ||
| set(CMAKE_POSITION_INDEPENDENT_CODE ON) | ||
|
|
||
| find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) | ||
| find_package(pybind11 CONFIG REQUIRED) | ||
| function(lazyllm_enable_strict_warnings target_name) | ||
| if (MSVC) | ||
| target_compile_options(${target_name} PRIVATE /W4 /WX) | ||
| else () | ||
| target_compile_options(${target_name} PRIVATE -Werror -Wshadow) | ||
| endif () | ||
| endfunction() | ||
|
|
||
| # Third party libs | ||
| include(cmake/third_party.cmake) | ||
|
|
||
| # Config lazyllm_core lib with pure cpp code. | ||
| file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") | ||
| file(GLOB_RECURSE LAZYLLM_CORE_SOURCES CONFIGURE_DEPENDS | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/core/src/*.cpp") | ||
| add_library(lazyllm_core STATIC ${LAZYLLM_CORE_SOURCES}) | ||
| target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) | ||
| target_include_directories(lazyllm_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/core/include) | ||
| target_link_libraries(lazyllm_core PUBLIC xxhash) | ||
| target_link_libraries(lazyllm_core PUBLIC tiktoken) | ||
| target_link_libraries(lazyllm_core PUBLIC utf8proc) | ||
| lazyllm_enable_strict_warnings(lazyllm_core) | ||
CompromisedKiwi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Config lazyllm_cpp lib with binding information. | ||
| set(LAZYLLM_BINDING_SOURCES binding/lazyllm.cpp binding/doc.cpp) | ||
| file(GLOB_RECURSE LAZYLLM_BINDING_SOURCES CONFIGURE_DEPENDS | ||
| "${CMAKE_CURRENT_SOURCE_DIR}/binding/*.cpp") | ||
| set(INTERFACE_TARGET_NAME lazyllm_cpp) | ||
| pybind11_add_module(${INTERFACE_TARGET_NAME} ${LAZYLLM_BINDING_SOURCES}) | ||
| target_include_directories(${INTERFACE_TARGET_NAME} PRIVATE | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/binding | ||
| ${CMAKE_CURRENT_SOURCE_DIR}/core/include | ||
| ) | ||
| target_link_libraries(${INTERFACE_TARGET_NAME} PRIVATE lazyllm_core) | ||
| lazyllm_enable_strict_warnings(${INTERFACE_TARGET_NAME}) | ||
|
|
||
| # Runtime loader configuration per platform. | ||
| set(_lazyllm_cpp_rpath "") | ||
| set(LAZYLLM_TEST_RUNTIME_ENV "" CACHE INTERNAL "Runtime env for LazyLLM C++ tests" FORCE) | ||
| if (WIN32) | ||
| # Windows has no ELF rpath; loader resolution is driven by PATH and DLL search order. | ||
| # Keep test runtime env empty by default. | ||
| elseif (APPLE) | ||
| # Ensure lazyllm_cpp can find third-party dylibs under lazyllm/cpp_lib. | ||
| list(APPEND _lazyllm_cpp_rpath "@loader_path/cpp_lib") | ||
| else () | ||
| # Ensure lazyllm_cpp can find third-party shared libraries under lazyllm/cpp_lib. | ||
| list(APPEND _lazyllm_cpp_rpath "$ORIGIN/cpp_lib") | ||
| # Use DT_RPATH (instead of DT_RUNPATH) so the extension's own runtime | ||
| # search path can take precedence over host interpreter bundled libs. | ||
| target_link_options(${INTERFACE_TARGET_NAME} PRIVATE -Wl,--disable-new-dtags) | ||
|
|
||
| # Resolve libstdc++ from the active C++ compiler and include it in rpath. | ||
| execute_process( | ||
| COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libstdc++.so.6 | ||
| OUTPUT_VARIABLE LIBSTDCPP_PATH | ||
| OUTPUT_STRIP_TRAILING_WHITESPACE | ||
| ) | ||
| if (LIBSTDCPP_PATH AND NOT LIBSTDCPP_PATH STREQUAL "libstdc++.so.6") | ||
| get_filename_component(LIBSTDCPP_DIR "${LIBSTDCPP_PATH}" DIRECTORY) | ||
| if (LIBSTDCPP_DIR) | ||
| list(APPEND _lazyllm_cpp_rpath "${LIBSTDCPP_DIR}") | ||
| set(LAZYLLM_TEST_RUNTIME_ENV "LD_LIBRARY_PATH=${LIBSTDCPP_DIR}:$ENV{LD_LIBRARY_PATH}" | ||
| CACHE INTERNAL "Runtime env for LazyLLM C++ tests" FORCE) | ||
| endif () | ||
| endif () | ||
| endif () | ||
|
|
||
| if (_lazyllm_cpp_rpath) | ||
| set_target_properties(${INTERFACE_TARGET_NAME} PROPERTIES | ||
| BUILD_RPATH "${_lazyllm_cpp_rpath}" | ||
| INSTALL_RPATH "${_lazyllm_cpp_rpath}" | ||
| ) | ||
| endif () | ||
|
|
||
| if (CMAKE_BUILD_TYPE STREQUAL "Debug") | ||
| # SHOW_SYMBOL | ||
|
|
@@ -26,7 +84,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") | |
| endif() | ||
|
|
||
| # Install | ||
| install(TARGETS ${INTERFACE_TARGET_NAME} LIBRARY DESTINATION lazyllm) | ||
| install(TARGETS ${INTERFACE_TARGET_NAME} | ||
| LIBRARY DESTINATION lazyllm COMPONENT lazyllm_cpp | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an AI-generated suggestion; please verify before applying. [medium] [logic] Install rules for shared libraries omit ARCHIVE destination, so Windows import libraries (.lib) won't be installed correctly. Suggestion: Add an `ARCHIVE DESTINATION ... COMPONENT lazyllm_cpp` clause next to the existing `LIBRARY` and `RUNTIME` clauses of each `install(TARGETS ...)` rule. auto reviewed by BOT (claude-opus-4-6) |
||
| RUNTIME DESTINATION lazyllm COMPONENT lazyllm_cpp | ||
| ) | ||
| install(TARGETS tiktoken utf8proc | ||
| LIBRARY DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp | ||
| RUNTIME DESTINATION lazyllm/cpp_lib COMPONENT lazyllm_cpp | ||
| ) | ||
|
|
||
|
|
||
| # TESTS | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,131 @@ | ||
| #include "binding_utils.hpp" | ||
|
|
||
| #include <type_traits> | ||
|
|
||
| namespace lazyllm::pybind_utils { | ||
|
|
||
| std::string DumpJson(const py::object& obj) { | ||
| py::object json = py::module_::import("json"); | ||
| py::object dumps = json.attr("dumps"); | ||
| py::object dumped = dumps(obj, py::arg("ensure_ascii") = false); | ||
| return dumped.cast<std::string>(); | ||
| } | ||
|
|
||
| py::object LoadJson(const std::string& text) { | ||
| py::object json = py::module_::import("json"); | ||
| py::object loads = json.attr("loads"); | ||
| return loads(py::str(text)); | ||
| } | ||
|
|
||
| bool ExtractStringSequence(const py::object& obj, std::vector<std::string>* out) { | ||
| if (!py::isinstance<py::sequence>(obj) || py::isinstance<py::str>(obj)) return false; | ||
| py::sequence seq = obj.cast<py::sequence>(); | ||
| out->clear(); | ||
| out->reserve(seq.size()); | ||
| for (py::handle item : seq) { | ||
| if (!py::isinstance<py::str>(item)) { | ||
| out->clear(); | ||
| return false; | ||
| } | ||
| out->push_back(py::cast<std::string>(item)); | ||
| } | ||
| return true; | ||
| } | ||
|
|
||
| lazyllm::MetadataMode ParseMetadataMode(const py::object& mode) { | ||
| if (mode.is_none()) return lazyllm::MetadataMode::NONE; | ||
| try { | ||
| if (py::hasattr(mode, "name")) { | ||
| const auto name = py::cast<std::string>(mode.attr("name")); | ||
| if (name == "ALL") return lazyllm::MetadataMode::ALL; | ||
| if (name == "EMBED") return lazyllm::MetadataMode::EMBED; | ||
| if (name == "LLM") return lazyllm::MetadataMode::LLM; | ||
| if (name == "NONE") return lazyllm::MetadataMode::NONE; | ||
| } | ||
| } catch (const py::error_already_set&) { | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is an AI-generated suggestion; please verify before applying. [medium] [exception] Silently swallowing `py::error_already_set` in an empty catch block hides any failure raised while reading `mode.name`. Suggestion: At minimum, log or clear the Python error state explicitly. Consider narrowing the catch scope or adding a comment explaining why the exception is intentionally ignored. auto reviewed by BOT (claude-opus-4-6) |
||
| if (py::isinstance<py::str>(mode)) { | ||
| const auto name = mode.cast<std::string>(); | ||
| if (name == "ALL") return lazyllm::MetadataMode::ALL; | ||
| if (name == "EMBED") return lazyllm::MetadataMode::EMBED; | ||
| if (name == "LLM") return lazyllm::MetadataMode::LLM; | ||
| if (name == "NONE") return lazyllm::MetadataMode::NONE; | ||
| } | ||
| if (py::isinstance<py::int_>(mode)) { | ||
| const auto value = mode.cast<int>(); | ||
| switch (value) { | ||
| case 0: return lazyllm::MetadataMode::ALL; | ||
| case 1: return lazyllm::MetadataMode::EMBED; | ||
| case 2: return lazyllm::MetadataMode::LLM; | ||
| case 3: return lazyllm::MetadataMode::NONE; | ||
| default: break; | ||
| } | ||
| } | ||
| return lazyllm::MetadataMode::NONE; | ||
| } | ||
|
|
||
| lazyllm::MetadataVType PyToMetadataValue(const py::handle& value) { | ||
| if (value.is_none()) return std::string("None"); | ||
| if (py::isinstance<py::bool_>(value)) return static_cast<int>(value.cast<bool>()); | ||
| if (py::isinstance<py::int_>(value)) return value.cast<int>(); | ||
| if (py::isinstance<py::float_>(value)) return value.cast<double>(); | ||
| if (py::isinstance<py::str>(value)) return value.cast<std::string>(); | ||
|
|
||
| if (py::isinstance<py::sequence>(value) && !py::isinstance<py::str>(value)) { | ||
| py::sequence seq = value.cast<py::sequence>(); | ||
| if (seq.empty()) return std::vector<std::string>{}; | ||
|
|
||
| bool all_str = true; | ||
| bool all_int = true; | ||
| bool all_numeric = true; | ||
|
|
||
| for (py::handle item : seq) { | ||
| const bool is_str = py::isinstance<py::str>(item); | ||
| const bool is_int = py::isinstance<py::int_>(item) && !py::isinstance<py::bool_>(item); | ||
| const bool is_numeric = is_int || py::isinstance<py::float_>(item) || py::isinstance<py::bool_>(item); | ||
| all_str = all_str && is_str; | ||
| all_int = all_int && is_int; | ||
| all_numeric = all_numeric && is_numeric; | ||
| } | ||
|
|
||
| if (all_str) { | ||
| std::vector<std::string> out; | ||
| out.reserve(seq.size()); | ||
| for (py::handle item : seq) out.push_back(py::cast<std::string>(item)); | ||
| return out; | ||
| } | ||
| if (all_int) { | ||
| std::vector<int> out; | ||
| out.reserve(seq.size()); | ||
| for (py::handle item : seq) out.push_back(py::cast<int>(item)); | ||
| return out; | ||
| } | ||
| if (all_numeric) { | ||
| std::vector<double> out; | ||
| out.reserve(seq.size()); | ||
| for (py::handle item : seq) out.push_back(py::cast<double>(item)); | ||
| return out; | ||
| } | ||
|
|
||
| std::vector<std::string> out; | ||
| out.reserve(seq.size()); | ||
| for (py::handle item : seq) out.push_back(py::str(item).cast<std::string>()); | ||
| return out; | ||
| } | ||
| return py::str(value).cast<std::string>(); | ||
| } | ||
|
|
||
| py::object MetadataValueToPy(const lazyllm::MetadataVType& value) { | ||
| return std::visit([](const auto& v) -> py::object { | ||
| using T = std::decay_t<decltype(v)>; | ||
| if constexpr (std::is_same_v<T, std::string>) return py::str(v); | ||
| if constexpr (std::is_same_v<T, int>) return py::int_(v); | ||
| if constexpr (std::is_same_v<T, double>) return py::float_(v); | ||
| if constexpr (std::is_same_v<T, std::vector<std::string>>) return py::cast(v); | ||
| if constexpr (std::is_same_v<T, std::vector<int>>) return py::cast(v); | ||
| if constexpr (std::is_same_v<T, std::vector<double>>) return py::cast(v); | ||
| return py::none(); | ||
| }, value); | ||
| } | ||
|
|
||
| } // namespace lazyllm::pybind_utils | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an AI-generated suggestion; please verify before applying.
[critical] [logic]
`if: always()` 使得 `cpp_ext_test` 作业在 `needs: [clone]` 失败时仍会运行,可能导致在没有正确 clone 的情况下执行后续步骤并产生不可预测的失败。 Suggestion: 如果意图是即使其他作业失败也运行,但前提是 clone 成功,应改为: `if: ${{ always() && needs.clone.result == 'success' }}`
auto reviewed by BOT (claude-opus-4-6)