diff --git a/.appveyor.yml b/.appveyor.yml index 274064fc56cd..a5cd02d69e23 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -23,7 +23,6 @@ clone_depth: 5 install: - git submodule update --init --recursive # get `external_libs` folder - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% - set PYTHON_VERSION=%CONFIGURATION% - set CONDA_ENV="test-env" diff --git a/.ci/check_python_dists.sh b/.ci/check_python_dists.sh index e7e4a86b47e4..cb0bbae79fa9 100644 --- a/.ci/check_python_dists.sh +++ b/.ci/check_python_dists.sh @@ -17,4 +17,35 @@ if { test "${TASK}" = "bdist" || test "${METHOD}" = "wheel"; }; then check-wheel-contents ${DIST_DIR}/*.whl || exit -1 fi +PY_MINOR_VER=$(python -c "import sys; print(sys.version_info.minor)") +if [ $PY_MINOR_VER -gt 7 ]; then + echo "pydistcheck..." + pip install pydistcheck + if { test "${TASK}" = "cuda" || test "${METHOD}" = "wheel"; }; then + pydistcheck \ + --inspect \ + --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \ + --max-allowed-size-uncompressed '60M' \ + --max-allowed-files 800 \ + ${DIST_DIR}/* || exit -1 + elif { test $(uname -m) = "aarch64"; }; then + pydistcheck \ + --inspect \ + --ignore 'compiled-objects-have-debug-symbols' \ + --max-allowed-size-compressed '5M' \ + --max-allowed-size-uncompressed '15M' \ + --max-allowed-files 800 \ + ${DIST_DIR}/* || exit -1 + else + pydistcheck \ + --inspect \ + --max-allowed-size-compressed '5M' \ + --max-allowed-size-uncompressed '15M' \ + --max-allowed-files 800 \ + ${DIST_DIR}/* || exit -1 + fi +else + echo "skipping pydistcheck (does not support Python 3.${PY_MINOR_VER})" +fi + echo "done checking Python package distributions" diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh new file mode 100755 index 000000000000..ef9fff683731 --- /dev/null +++ b/.ci/lint-cpp.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +echo "running cpplint" +cpplint \ + 
--filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length \ + --recursive ./src ./include ./R-package ./swig ./tests \ +|| exit -1 +echo "done running cpplint" + +echo "running cmakelint" +cmake_files=$( + find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ + | grep -v external_libs +) +cmakelint \ + --linelength=120 \ + --filter=-convention/filename,-package/stdargs,-readability/wonkycase \ + ${cmake_files} \ +|| exit -1 +echo "done running cmakelint" diff --git a/.ci/lint-python.sh b/.ci/lint-python.sh new file mode 100755 index 000000000000..887bc9fdebf1 --- /dev/null +++ b/.ci/lint-python.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +echo "running ruff" +ruff check \ + --config=./python-package/pyproject.toml \ + . \ +|| exit -1 +echo "done running ruff" + +echo "running isort" +isort \ + --check-only \ + --settings-path=./python-package/pyproject.toml \ + . \ +|| exit -1 +echo "done running isort" + +echo "running mypy" +mypy \ + --config-file=./python-package/pyproject.toml \ + ./python-package \ +|| true +echo "done running mypy" diff --git a/.ci/test.sh b/.ci/test.sh index 4b01e7c241af..80ed7d2d0ce3 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -66,30 +66,22 @@ if [[ $TASK == "swig" ]]; then fi if [[ $TASK == "lint" ]]; then + cd ${BUILD_DIRECTORY} conda create -q -y -n $CONDA_ENV \ ${CONDA_PYTHON_REQUIREMENT} \ cmakelint \ cpplint \ - flake8 \ isort \ mypy \ - pydocstyle \ - "r-lintr>=3.0" + 'r-lintr>=3.0' \ + ruff source activate $CONDA_ENV echo "Linting Python code" - flake8 \ - --ignore=E501,W503 \ - --exclude=./.nuget,./external_libs,./python-package/build \ - . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 - isort . 
--check-only || exit -1 - mypy --ignore-missing-imports python-package/ || true + sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1 echo "Linting R code" Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 - cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") - cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 + sh ${BUILD_DIRECTORY}/.ci/lint-cpp.sh || exit -1 exit 0 fi @@ -153,21 +145,23 @@ if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "clang" ]]; then fi if [[ $TASK == "sdist" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz -v || exit -1 if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY + cp $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1 fi pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $TASK == "bdist" ]]; then if [[ $OS_NAME == "macos" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=macosx --python-tag py3 || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - mv dist/lightgbm-$LGB_VER-py3-none-macosx.whl 
dist/lightgbm-$LGB_VER-py3-none-macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl + cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + mv \ + ./dist/*.whl \ + dist/lightgbm-$LGB_VER-py3-none-macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl || exit -1 if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY + cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1 fi else ARCH=$(uname -m) @@ -176,37 +170,51 @@ elif [[ $TASK == "bdist" ]]; then else PLATFORM="manylinux2014_$ARCH" fi - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --integrated-opencl --plat-name=$PLATFORM --python-tag py3 || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --integrated-opencl || exit -1 + mv \ + ./dist/*.whl \ + ./dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY + cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1 fi # Make sure we can do both CPU and GPU; see tests/python_package_test/test_dual.py export LIGHTGBM_TEST_DUAL_CPU_GPU=1 fi - pip install --user $BUILD_DIRECTORY/python-package/dist/*.whl || exit -1 + pip install --user $BUILD_DIRECTORY/dist/*.whl || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 fi -mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build +# temporarily pin pip to versions that support 'pip install --install-option' +# ref: https://github.com/microsoft/LightGBM/issues/5061#issuecomment-1510642287 +if [[ $METHOD == "pip" ]]; then + pip 
install 'pip<23.1' +fi if [[ $TASK == "gpu" ]]; then sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'std::string device_type = "gpu"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--gpu || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install \ + --user \ + -v \ + --install-option=--gpu \ + $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \ + || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --gpu || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --gpu || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then + mkdir $BUILD_DIRECTORY/build + cd $BUILD_DIRECTORY/build cmake -DUSE_GPU=ON .. 
fi elif [[ $TASK == "cuda" ]]; then @@ -216,43 +224,59 @@ elif [[ $TASK == "cuda" ]]; then sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install \ + --user \ + -v \ + --install-option=--cuda \ + $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \ + || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --cuda || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then + mkdir $BUILD_DIRECTORY/build + cd $BUILD_DIRECTORY/build cmake -DUSE_CUDA=ON .. 
fi elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--mpi || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install \ + --user \ + -v \ + --install-option=--mpi \ + $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \ + || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --mpi || exit -1 - sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1 - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 + cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --mpi || exit -1 + sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1 + pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then + mkdir $BUILD_DIRECTORY/build + cd $BUILD_DIRECTORY/build cmake -DUSE_MPI=ON -DUSE_DEBUG=ON .. fi else + mkdir $BUILD_DIRECTORY/build + cd $BUILD_DIRECTORY/build cmake .. 
fi make _lightgbm -j4 || exit -1 -cd $BUILD_DIRECTORY/python-package && python setup.py install --precompile --user || exit -1 +cd $BUILD_DIRECTORY && sh ./build-python.sh install --precompile --user || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 if [[ $TASK == "regular" ]]; then diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index dbc76f061114..34322ecef7d1 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -17,7 +17,7 @@ fi R_MAJOR_VERSION=( ${R_VERSION//./ } ) if [[ "${R_MAJOR_VERSION}" == "3" ]]; then export R_MAC_VERSION=3.6.3 - export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/R-${R_MAC_VERSION}.pkg + export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/R-${R_MAC_VERSION}.nn.pkg export R_LINUX_VERSION="3.6.3-1bionic" export R_APT_REPO="bionic-cran35/" elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then @@ -77,13 +77,14 @@ fi # Installing R precompiled for Mac OS 10.11 or higher if [[ $OS_NAME == "macos" ]]; then + brew update-reset && brew update if [[ $R_BUILD_TYPE == "cran" ]]; then brew install automake || exit -1 fi brew install \ checkbashisms \ qpdf || exit -1 - brew install --cask basictex || exit -1 + brew install basictex || exit -1 export PATH="/Library/TeX/texbin:$PATH" sudo tlmgr --verify-repo=none update --self || exit -1 sudo tlmgr --verify-repo=none install inconsolata helvetic rsfs || exit -1 @@ -117,6 +118,16 @@ if [[ $OS_NAME == "macos" ]]; then fi fi +# fix for issue where CRAN was not returning {lattice} when using R 3.6 +# "Warning: dependency ‘lattice’ is not available" +# +# refs for that MRAN snapshot: +# * https://cran.r-project.org/web/packages/checkpoint/readme/README.html +# * https://help.codeocean.com/en/articles/3087704-using-mran-snapshots-to-install-archived-r-packages +if [[ "${R_MAJOR_VERSION}" == "3" ]]; then + Rscript --vanilla -e "install.packages('lattice', repos = 'https://cran.microsoft.com/snapshot/2020-04-23/', lib = '${R_LIB_PATH}')" +fi + # Manually install Depends and Imports libraries + 'knitr', 
'RhpcBLASctl', 'rmarkdown', 'testthat' # to avoid a CI-time dependency on devtools (for devtools::install_deps()) # NOTE: testthat is not required when running rchk diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 4735de82902c..1e14cec3fc28 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -35,7 +35,7 @@ if ($env:TASK -eq "swig") { mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build cmake -A x64 -DUSE_SWIG=ON .. ; cmake --build . --target ALL_BUILD --config Release ; Check-Output $? if ($env:AZURE -eq "true") { - cp $env:BUILD_SOURCESDIRECTORY/build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar + cp $env:BUILD_SOURCESDIRECTORY/build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar ; Check-Output $? } Exit 0 } @@ -44,6 +44,12 @@ if ($env:TASK -eq "swig") { conda init powershell conda activate conda config --set always_yes yes --set changeps1 no + +# ref: +# * https://stackoverflow.com/a/62897729/3986677 +# * https://github.com/microsoft/LightGBM/issues/5899 +conda install brotlipy + conda update -q -y conda conda create -q -y -n $env:CONDA_ENV ` cloudpickle ` @@ -65,15 +71,15 @@ if ($env:TASK -ne "bdist") { if ($env:TASK -eq "regular") { mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build cmake -A x64 .. ; cmake --build . --target ALL_BUILD --config Release ; Check-Output $? - cd $env:BUILD_SOURCESDIRECTORY/python-package - python setup.py install --precompile ; Check-Output $? + cd $env:BUILD_SOURCESDIRECTORY + sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --precompile ; Check-Output $? cp $env:BUILD_SOURCESDIRECTORY/Release/lib_lightgbm.dll $env:BUILD_ARTIFACTSTAGINGDIRECTORY cp $env:BUILD_SOURCESDIRECTORY/Release/lightgbm.exe $env:BUILD_ARTIFACTSTAGINGDIRECTORY } elseif ($env:TASK -eq "sdist") { - cd $env:BUILD_SOURCESDIRECTORY/python-package - python setup.py sdist --formats gztar ; Check-Output $? 
- sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/python-package/dist ; Check-Output $? + cd $env:BUILD_SOURCESDIRECTORY + sh $env:BUILD_SOURCESDIRECTORY/build-python.sh sdist ; Check-Output $? + sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $? cd dist; pip install @(Get-ChildItem *.gz) -v ; Check-Output $? } elseif ($env:TASK -eq "bdist") { @@ -87,17 +93,17 @@ elseif ($env:TASK -eq "bdist") { Get-ItemProperty -Path Registry::HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors conda activate $env:CONDA_ENV - cd $env:BUILD_SOURCESDIRECTORY/python-package - python setup.py bdist_wheel --integrated-opencl --plat-name=win-amd64 --python-tag py3 ; Check-Output $? - sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/python-package/dist ; Check-Output $? + cd $env:BUILD_SOURCESDIRECTORY + sh "build-python.sh" bdist_wheel --integrated-opencl ; Check-Output $? + sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $? cd dist; pip install --user @(Get-ChildItem *.whl) ; Check-Output $? cp @(Get-ChildItem *.whl) $env:BUILD_ARTIFACTSTAGINGDIRECTORY } elseif (($env:APPVEYOR -eq "true") -and ($env:TASK -eq "python")) { - cd $env:BUILD_SOURCESDIRECTORY\python-package + cd $env:BUILD_SOURCESDIRECTORY if ($env:COMPILER -eq "MINGW") { - python setup.py install --mingw ; Check-Output $? + sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --mingw ; Check-Output $? } else { - python setup.py install ; Check-Output $? + sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install ; Check-Output $? 
} } diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 62ebc86726a6..eb2cb90a424e 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -63,24 +63,12 @@ jobs: r_version: 4.2 build_type: cmake container: 'ubuntu:22.04' - - os: macOS-latest - task: r-package - compiler: gcc - r_version: 3.6 - build_type: cmake - container: null - os: macOS-latest task: r-package compiler: gcc r_version: 4.2 build_type: cmake container: null - - os: macOS-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: null - os: macOS-latest task: r-package compiler: clang diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml index 415cbb66086a..bf369e79c0c5 100644 --- a/.github/workflows/static_analysis.yml +++ b/.github/workflows/static_analysis.yml @@ -21,7 +21,7 @@ env: CONDA_ENV: test-env GITHUB_ACTIONS: 'true' OS_NAME: 'linux' - PYTHON_VERSION: '3.10' + PYTHON_VERSION: '3.11' jobs: test: diff --git a/.gitignore b/.gitignore index bb65ca426bba..d4045d9a4798 100644 --- a/.gitignore +++ b/.gitignore @@ -399,6 +399,7 @@ lightgbm.model /cmake-build-debug/ # Files from local Python install +lightgbm-python/ python-package/LICENSE python-package/build_cpp/ python-package/compile/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 750b41ab8164..0792f0959ca6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,11 @@ set( "Semicolon separated list of sanitizer names, e.g., 'address;leak'. \ Supported sanitizers are address, leak, undefined and thread." ) +option(BUILD_CLI "Build the 'lightgbm' command-line interface in addition to lib_lightgbm" ON) option(BUILD_CPP_TEST "Build C++ tests with Google Test" OFF) option(BUILD_STATIC_LIB "Build static library" OFF) +option(INSTALL_HEADERS "Install headers to CMAKE_INSTALL_PREFIX (e.g.
'/usr/local/include')" ON) +option(__BUILD_FOR_PYTHON "Set to ON if building lib_lightgbm for use with the Python package" OFF) option(__BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF) option(__INTEGRATE_OPENCL "Set to ON if building LightGBM with the OpenCL ICD Loader and its dependencies included" OFF) @@ -55,6 +58,14 @@ if(__INTEGRATE_OPENCL) message(STATUS "Building library with integrated OpenCL components") endif() +if(__BUILD_FOR_PYTHON OR __BUILD_FOR_R) + # the Python and R package don't require the CLI + set(BUILD_CLI OFF) + # installing the R and Python package shouldn't place LightGBM's headers + # outside of where the package is installed + set(INSTALL_HEADERS OFF) +endif() + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8.2") message(FATAL_ERROR "Insufficient gcc version") @@ -421,8 +432,10 @@ endif() add_library(lightgbm_objs OBJECT ${SOURCES}) -add_executable(lightgbm src/main.cpp src/application/application.cpp) -target_link_libraries(lightgbm PRIVATE lightgbm_objs) +if(BUILD_CLI) + add_executable(lightgbm src/main.cpp src/application/application.cpp) + target_link_libraries(lightgbm PRIVATE lightgbm_objs) +endif() set(API_SOURCES "src/c_api.cpp") # Only build the R part of the library if building for @@ -544,19 +557,25 @@ if(USE_CUDA) # each target that contains or depends on cuda source. set_target_properties(lightgbm_objs PROPERTIES CUDA_ARCHITECTURES OFF) set_target_properties(_lightgbm PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties(lightgbm PROPERTIES CUDA_ARCHITECTURES OFF) + if(BUILD_CLI) + set_target_properties(lightgbm PROPERTIES CUDA_ARCHITECTURES OFF) + endif() set_target_properties(lightgbm_objs PROPERTIES CUDA_SEPARABLE_COMPILATION ON) # Device linking is not supported for object libraries. # Thus we have to specify them on final targets. 
- set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + if(BUILD_CLI) + set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + endif() set_target_properties(_lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) # histograms are list of object libraries. Linking object library to other # object libraries only gets usage requirements, the linked objects won't be # used. Thus we have to call target_link_libraries on final targets here. - target_link_libraries(lightgbm PRIVATE ${histograms}) + if(BUILD_CLI) + target_link_libraries(lightgbm PRIVATE ${histograms}) + endif() target_link_libraries(_lightgbm PRIVATE ${histograms}) endif() @@ -566,7 +585,7 @@ endif() if(WIN32) if(MINGW OR CYGWIN) - target_link_libraries(lightgbm_objs PUBLIC Ws2_32 IPHLPAPI) + target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi) endif() endif() @@ -619,11 +638,20 @@ if(BUILD_CPP_TEST) target_link_libraries(testlightgbm PRIVATE lightgbm_objs lightgbm_capi_objs GTest::GTest) endif() +if(BUILD_CLI) + install( + TARGETS lightgbm + RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin + ) +endif() + install( - TARGETS lightgbm _lightgbm + TARGETS _lightgbm RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib ) -install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +if(INSTALL_HEADERS) + install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +endif() diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 471bfc948cc3..ba9ef054bfab 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -48,6 +48,7 @@ OBJECTS = \ treelearner/data_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ + treelearner/gradient_discretizer.o \ treelearner/linear_tree_learner.o \ treelearner/serial_tree_learner.o \ 
treelearner/tree_learner.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 8d39317b4a3a..14f5afde002f 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -19,7 +19,7 @@ PKG_LIBS = \ ${SHLIB_OPENMP_CXXFLAGS} \ ${SHLIB_PTHREAD_FLAGS} \ -lws2_32 \ - -lIphlpapi + -liphlpapi OBJECTS = \ boosting/boosting.o \ @@ -49,6 +49,7 @@ OBJECTS = \ treelearner/data_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \ treelearner/gpu_tree_learner.o \ + treelearner/gradient_discretizer.o \ treelearner/linear_tree_learner.o \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ diff --git a/build-python.sh b/build-python.sh new file mode 100755 index 000000000000..1dd8bc9fe966 --- /dev/null +++ b/build-python.sh @@ -0,0 +1,342 @@ +#!/bin/sh + +# [description] +# +# Prepare a source distribution (sdist) or built distribution (wheel) +# of the Python package, and optionally install it. +# +# [usage] +# +# # build sdist and put it in dist/ +# sh ./build-python.sh sdist +# +# # build wheel and put it in dist/ +# sh ./build-python.sh bdist_wheel [OPTIONS] +# +# # compile lib_lightgbm and install the Python package wrapping it +# sh ./build-python.sh install [OPTIONS] +# +# # install the Python package using a pre-compiled lib_lightgbm +# # (assumes lib_lightgbm.{dll,so} is located at the root of the repo) +# sh ./build-python.sh install --precompile +# +# [options] +# +# --boost-dir=FILEPATH +# Directory with Boost package configuration file. +# --boost-include-dir=FILEPATH +# Directory containing Boost headers. +# --boost-librarydir=FILEPATH +# Preferred Boost library directory. +# --boost-root=FILEPATH +# Boost preferred installation prefix. +# --opencl-include-dir=FILEPATH +# OpenCL include directory. +# --opencl-library=FILEPATH +# Path to OpenCL library. +# --bit32 +# Compile 32-bit version. +# --cuda +# Compile CUDA version. +# --gpu +# Compile GPU version. 
+# --hdfs +# Compile HDFS version. +# --integrated-opencl +# Compile integrated OpenCL version. +# --mingw +# Compile with MinGW. +# --mpi +# Compile MPI version. +# --nomp +# Compile version without OpenMP support. +# --precompile +# Use precompiled library. +# Only used with 'install' command. +# --time-costs +# Output time costs for different internal routines. +# --user +# Install into user-specific instead of global site-packages directory. +# Only used with 'install' command. + +set -e -u + +echo "building lightgbm" + +# Default values of arguments +INSTALL="false" +BUILD_SDIST="false" +BUILD_WHEEL="false" + +PIP_INSTALL_ARGS="" +BUILD_ARGS="" +PRECOMPILE="false" + +while [ $# -gt 0 ]; do + case "$1" in + ############################ + # sub-commands of setup.py # + ############################ + install) + INSTALL="true" + ;; + sdist) + BUILD_SDIST="true" + ;; + bdist_wheel) + BUILD_WHEEL="true" + ;; + ############################ + # customized library paths # + ############################ + --boost-dir|--boost-dir=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi + BOOST_DIR="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --boost-dir='${BOOST_DIR}'" + ;; + --boost-include-dir|--boost-include-dir=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi + BOOST_INCLUDE_DIR="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --boost-include-dir='${BOOST_INCLUDE_DIR}'" + ;; + --boost-librarydir|--boost-librarydir=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi + BOOST_LIBRARY_DIR="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --boost-librarydir='${BOOST_LIBRARY_DIR}'" + ;; + --boost-root|--boost-root=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi + BOOST_ROOT="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --boost-root='${BOOST_ROOT}'" + ;; + --opencl-include-dir|--opencl-include-dir=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi + OPENCL_INCLUDE_DIR="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --opencl-include-dir='${OPENCL_INCLUDE_DIR}'" + ;; + --opencl-library|--opencl-library=*) + if [ "${1#*=}" = "$1" ]; + then shift; + fi +
OPENCL_LIBRARY="${1#*=}" + BUILD_ARGS="${BUILD_ARGS} --opencl-library='${OPENCL_LIBRARY}'" + ;; + ######### + # flags # + ######### + --bit32) + BUILD_ARGS="${BUILD_ARGS} --bit32" + ;; + --cuda) + BUILD_ARGS="${BUILD_ARGS} --cuda" + ;; + --gpu) + BUILD_ARGS="${BUILD_ARGS} --gpu" + ;; + --hdfs) + BUILD_ARGS="${BUILD_ARGS} --hdfs" + ;; + --integrated-opencl) + BUILD_ARGS="${BUILD_ARGS} --integrated-opencl" + ;; + --mingw) + BUILD_ARGS="${BUILD_ARGS} --mingw" + ;; + --mpi) + BUILD_ARGS="${BUILD_ARGS} --mpi" + ;; + --nomp) + BUILD_ARGS="${BUILD_ARGS} --nomp" + ;; + --precompile) + PRECOMPILE="true" + ;; + --time-costs) + BUILD_ARGS="${BUILD_ARGS} --time-costs" + ;; + --user) + PIP_INSTALL_ARGS="${PIP_INSTALL_ARGS} --user" + ;; + *) + echo "invalid argument '${1}'" + exit -1 + ;; + esac + shift +done + +# create a new directory that just contains the files needed +# to build the Python package +create_isolated_source_dir() { + rm -rf \ + ./lightgbm-python \ + ./lightgbm \ + ./python-package/build \ + ./python-package/build_cpp \ + ./python-package/compile \ + ./python-package/dist \ + ./python-package/lightgbm.egg-info + + cp -R ./python-package ./lightgbm-python + + # temporarily remove these files until + # https://github.com/microsoft/LightGBM/issues/5061 is done + rm ./lightgbm-python/pyproject.toml + rm ./lightgbm-python/setup.cfg + + cp LICENSE ./lightgbm-python/ + cp VERSION.txt ./lightgbm-python/lightgbm/VERSION.txt + + mkdir -p ./lightgbm-python/compile + cp -R ./cmake ./lightgbm-python/compile + cp CMakeLists.txt ./lightgbm-python/compile + cp -R ./include ./lightgbm-python/compile + cp -R ./src ./lightgbm-python/compile + cp -R ./swig ./lightgbm-python/compile + cp -R ./windows ./lightgbm-python/compile + + # include only specific files from external_libs, to keep the package + # small and avoid redistributing code with licenses incompatible with + # LightGBM's license + + ###################### + # fast_double_parser # + ###################### + mkdir
-p ./lightgbm-python/compile/external_libs/fast_double_parser + cp \ + external_libs/fast_double_parser/CMakeLists.txt \ + ./lightgbm-python/compile/external_libs/fast_double_parser/CMakeLists.txt + cp \ + external_libs/fast_double_parser/LICENSE* \ + ./lightgbm-python/compile/external_libs/fast_double_parser/ + + mkdir -p ./lightgbm-python/compile/external_libs/fast_double_parser/include/ + cp \ + external_libs/fast_double_parser/include/fast_double_parser.h \ + ./lightgbm-python/compile/external_libs/fast_double_parser/include/ + + ####### + # fmt # + ####### + mkdir -p ./lightgbm-python/compile/external_libs/fmt + cp \ + external_libs/fmt/CMakeLists.txt \ + ./lightgbm-python/compile/external_libs/fmt/CMakeLists.txt + cp \ + external_libs/fmt/LICENSE* \ + ./lightgbm-python/compile/external_libs/fmt/ + + mkdir -p ./lightgbm-python/compile/external_libs/fmt/include/fmt + cp \ + external_libs/fmt/include/fmt/*.h \ + ./lightgbm-python/compile/external_libs/fmt/include/fmt/ + + ######### + # Eigen # + ######### + mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen + cp \ + external_libs/eigen/CMakeLists.txt \ + ./lightgbm-python/compile/external_libs/eigen/CMakeLists.txt + + modules="Cholesky Core Dense Eigenvalues Geometry Householder Jacobi LU QR SVD" + for eigen_module in ${modules}; do + cp \ + external_libs/eigen/Eigen/${eigen_module} \ + ./lightgbm-python/compile/external_libs/eigen/Eigen/${eigen_module} + if [ ${eigen_module} != "Dense" ]; then + mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/src/${eigen_module}/ + cp \ + -R \ + external_libs/eigen/Eigen/src/${eigen_module}/* \ + ./lightgbm-python/compile/external_libs/eigen/Eigen/src/${eigen_module}/ + fi + done + + mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/misc + cp \ + -R \ + external_libs/eigen/Eigen/src/misc \ + ./lightgbm-python/compile/external_libs/eigen/Eigen/src/misc/ + + mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/plugins + cp
\ + -R \ + external_libs/eigen/Eigen/src/plugins \ + ./lightgbm-python/compile/external_libs/eigen/Eigen/src/plugins/ + + ################### + # compute (Boost) # + ################### + mkdir -p ./lightgbm-python/compile/external_libs/compute + cp \ + external_libs/compute/CMakeLists.txt \ + ./lightgbm-python/compile/external_libs/compute/ + cp \ + -R \ + external_libs/compute/cmake \ + ./lightgbm-python/compile/external_libs/compute/cmake/ + cp \ + -R \ + external_libs/compute/include \ + ./lightgbm-python/compile/external_libs/compute/include/ + cp \ + -R \ + external_libs/compute/meta \ + ./lightgbm-python/compile/external_libs/compute/meta/ +} + +create_isolated_source_dir + +cd ./lightgbm-python + +# installation involves building the wheel + `pip install`-ing it +if test "${INSTALL}" = true; then + if test "${PRECOMPILE}" = true; then + echo "--- installing lightgbm (from precompiled lib_lightgbm) ---" + python setup.py install ${PIP_INSTALL_ARGS} --precompile + exit 0 + else + BUILD_SDIST="false" + BUILD_WHEEL="true" + fi +fi + +if test "${BUILD_SDIST}" = true; then + echo "--- building sdist ---" + rm -f ../dist/*.tar.gz + python ./setup.py sdist \ + --dist-dir ../dist +fi + +if test "${BUILD_WHEEL}" = true; then + echo "--- building wheel ---" + rm -f ../dist/*.whl || true + python setup.py bdist_wheel \ + --dist-dir ../dist \ + ${BUILD_ARGS} +fi + +if test "${INSTALL}" = true; then + echo "--- installing lightgbm ---" + # ref for use of '--find-links': https://stackoverflow.com/a/52481267/3986677 + cd ../dist + pip install \ + ${PIP_INSTALL_ARGS} \ + --find-links=. 
\ + lightgbm + cd ../ +fi + +echo "cleaning up" +rm -rf ./lightgbm-python diff --git a/docker/dockerfile-python b/docker/dockerfile-python index 6c5ca6501ac3..541884811a0b 100644 --- a/docker/dockerfile-python +++ b/docker/dockerfile-python @@ -26,7 +26,7 @@ RUN apt-get update && \ # lightgbm conda install -q -y numpy scipy scikit-learn pandas && \ git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \ - cd LightGBM/python-package && python setup.py install && \ + cd LightGBM && sh ./build-python.sh install && \ # clean apt-get autoremove -y && apt-get clean && \ conda clean -a -y && \ diff --git a/docker/gpu/dockerfile.gpu b/docker/gpu/dockerfile.gpu index bac9d97b2c2b..74c301234020 100644 --- a/docker/gpu/dockerfile.gpu +++ b/docker/gpu/dockerfile.gpu @@ -88,7 +88,7 @@ RUN cd /usr/local/src && mkdir lightgbm && cd lightgbm && \ ENV PATH /usr/local/src/lightgbm/LightGBM:${PATH} -RUN /bin/bash -c "source activate py3 && cd /usr/local/src/lightgbm/LightGBM/python-package && python setup.py install --precompile && source deactivate" +RUN /bin/bash -c "source activate py3 && cd /usr/local/src/lightgbm/LightGBM && sh ./build-python.sh install --precompile && source deactivate" ################################################################################################################# # System CleanUp diff --git a/docs/FAQ.rst b/docs/FAQ.rst index 9f86b882e0a1..2a09fd674e4c 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -277,6 +277,10 @@ Python-package 1. ``Error: setup script specifies an absolute path`` when installing from GitHub using ``python setup.py install``. -------------------------------------------------------------------------------------------------------------------- +.. note:: + As of v4.0.0, ``lightgbm`` does not support directly invoking ``setup.py``. + This answer refers only to versions of ``lightgbm`` prior to v4.0.0. + ..
code-block:: console error: Error: setup script specifies an absolute path: @@ -329,7 +333,7 @@ So, if you want to: We are doing our best to provide universal wheels which have high running speed and are compatible with any hardware, OS, compiler, etc. at the same time. However, sometimes it's just impossible to guarantee the possibility of usage of LightGBM in any specific environment (see `Microsoft/LightGBM#1743 `__). -Therefore, the first thing you should try in case of segfaults is **compiling from the source** using ``pip install --no-binary :all: lightgbm``. +Therefore, the first thing you should try in case of segfaults is **compiling from the source** using ``pip install --no-binary lightgbm lightgbm``. For the OS-specific prerequisites see `this guide `__. Also, feel free to post a new issue in our GitHub repository. We always look at each case individually and try to find a root cause. diff --git a/docs/GPU-Tutorial.rst b/docs/GPU-Tutorial.rst index 1ca98784e3f6..836ab1add378 100644 --- a/docs/GPU-Tutorial.rst +++ b/docs/GPU-Tutorial.rst @@ -80,9 +80,7 @@ If you want to use the Python interface of LightGBM, you can install it now (alo sudo apt-get -y install python-pip sudo -H pip install setuptools numpy scipy scikit-learn -U - cd python-package/ - sudo python setup.py install --precompile - cd .. + sudo sh ./build-python.sh install --precompile You need to set an additional parameter ``"device" : "gpu"`` (along with your other options like ``learning_rate``, ``num_leaves``, etc) to use GPU in Python. 
diff --git a/docs/Parameters.rst b/docs/Parameters.rst index abbd8cb14e14..aee1cc4e7f84 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -658,6 +658,38 @@ Learning Control Parameters - **Note**: can be used only in CLI version +- ``use_quantized_grad`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to use gradient quantization when training + + - enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` + + - with quantized training, most arithmetics in the training process will be integer operations + + - gradient quantization can accelerate training, with little accuracy drop in most cases + + - **Note**: can be used only with ``device_type = cpu`` + +- ``num_grad_quant_bins`` :raw-html:`🔗︎`, default = ``4``, type = int + + - number of bins to quantization gradients and hessians + + - with more bins, the quantized training will be closer to full precision training + + - **Note**: can be used only with ``device_type = cpu`` + +- ``quant_train_renew_leaf`` :raw-html:`🔗︎`, default = ``false``, type = bool + + - whether to renew the leaf values with original gradients when quantized training + + - renewing is very helpful for good quantized training accuracy for ranking objectives + + - **Note**: can be used only with ``device_type = cpu`` + +- ``stochastic_rounding`` :raw-html:`🔗︎`, default = ``true``, type = bool + + - whether to use stochastic rounding in gradient quantization + IO Parameters ------------- diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst index 7c1883652d96..04e64beb1281 100644 --- a/docs/Quick-Start.rst +++ b/docs/Quick-Start.rst @@ -59,14 +59,14 @@ Run LightGBM :: - "./lightgbm" config=your_config_file other_args ... + lightgbm config=your_config_file other_args ... Parameters can be set both in the config file and command line, and the parameters in command line have higher priority than in the config file. 
For example, the following command line will keep ``num_trees=10`` and ignore the same parameter in the config file. :: - "./lightgbm" config=train.conf num_trees=10 + lightgbm config=train.conf num_trees=10 Examples -------- diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index a6199bbbcbd2..ffb8f2844843 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -30,11 +30,14 @@ enum MissingType { }; typedef double hist_t; +typedef int32_t int_hist_t; typedef uint64_t hist_cnt_t; // check at compile time static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct"); const size_t kHistEntrySize = 2 * sizeof(hist_t); +const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t); +const size_t kInt16HistEntrySize = 2 * sizeof(int16_t); const int kHistOffset = 2; const double kSparseThreshold = 0.7; @@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size } } +inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) { + const int64_t* src_ptr = reinterpret_cast(src); + int64_t* dst_ptr = reinterpret_cast(dst); + const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (comm_size_t i = 0; i < steps; ++i) { + dst_ptr[i] += src_ptr[i]; + } +} + +inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) { + const int32_t* src_ptr = reinterpret_cast(src); + int32_t* dst_ptr = reinterpret_cast(dst); + const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); + const int num_threads = OMP_NUM_THREADS(); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (comm_size_t i = 0; i < steps; ++i) { + dst_ptr[i] += src_ptr[i]; + } +} + /*! 
\brief This class used to convert feature values into bin, * and store some meta information for bin*/ class BinMapper { @@ -332,6 +357,33 @@ class Bin { const score_t* ordered_gradients, const score_t* ordered_hessians, hist_t* out) const = 0; + virtual void ConstructHistogramInt8( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32( + const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, const score_t* ordered_hessians, + hist_t* out) const = 0; + /*! 
* \brief Construct histogram of this feature, * Note: We use ordered_gradients and ordered_hessians to improve cache hit chance @@ -351,6 +403,24 @@ class Bin { virtual void ConstructHistogram(data_size_t start, data_size_t end, const score_t* ordered_gradients, hist_t* out) const = 0; + virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, hist_t* out) const = 0; + virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t most_freq_bin, MissingType missing_type, bool default_left, @@ -464,6 +534,57 @@ class MultiValBin { const score_t* ordered_hessians, hist_t* out) const = 0; + virtual void ConstructHistogramInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + 
hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* hessians, + hist_t* out) const = 0; + + virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* ordered_hessians, + hist_t* out) const = 0; + virtual void FinishLoad() = 0; virtual bool IsSparse() = 0; diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index cbb2735baeb2..89318a7af246 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -592,6 +592,30 @@ struct Config { // desc = **Note**: can be used only in CLI version int snapshot_freq = -1; + // [no-save] + // desc = whether to use gradient quantization when training + // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins`` + // desc = with quantized training, most arithmetics in the training process will be integer operations + // desc = gradient quantization can accelerate training, with little accuracy drop in most cases + // desc = **Note**: can be used only with ``device_type = cpu`` + bool use_quantized_grad = 
false; + + // [no-save] + // desc = number of bins to quantization gradients and hessians + // desc = with more bins, the quantized training will be closer to full precision training + // desc = **Note**: can be used only with ``device_type = cpu`` + int num_grad_quant_bins = 4; + + // [no-save] + // desc = whether to renew the leaf values with original gradients when quantized training + // desc = renewing is very helpful for good quantized training accuracy for ranking objectives + // desc = **Note**: can be used only with ``device_type = cpu`` + bool quant_train_renew_leaf = false; + + // [no-save] + // desc = whether to use stochastic rounding in gradient quantization + bool stochastic_rounding = true; + #ifndef __NVCC__ #pragma endregion diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 79c4ed196b09..825c5c6ebcf8 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -598,10 +598,11 @@ class Dataset { MultiValBin* GetMultiBinFromAllFeatures(const std::vector& offsets) const; + template TrainingShareStates* GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise) const; + bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const; LIGHTGBM_EXPORT void FinishLoad(); @@ -636,7 +637,7 @@ class Dataset { void InitTrain(const std::vector& is_feature_used, TrainingShareStates* share_state) const; - template + template void ConstructHistogramsInner(const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -646,7 +647,7 @@ class Dataset { TrainingShareStates* share_state, hist_t* hist_data) const; - template + template void ConstructHistogramsMultiVal(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -654,6 +655,7 @@ class Dataset { TrainingShareStates* share_state, hist_t* hist_data) const; + template 
inline void ConstructHistograms( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, @@ -666,21 +668,21 @@ class Dataset { bool use_indices = data_indices != nullptr && (num_data < num_data_); if (share_state->is_constant_hessian) { if (use_indices) { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } else { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } } else { if (use_indices) { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } else { - ConstructHistogramsInner( + ConstructHistogramsInner( is_feature_used, data_indices, num_data, gradients, hessians, ordered_gradients, ordered_hessians, share_state, hist_data); } @@ -689,6 +691,9 @@ class Dataset { void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const; + template + void FixHistogramInt(int feature_idx, int64_t sum_gradient_and_hessian, hist_t* data) const; + inline data_size_t Split(int feature, const uint32_t* threshold, int num_threshold, bool default_left, const data_size_t* data_indices, diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 8c50734695b2..f102668edf70 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -19,7 +19,7 @@ namespace LightGBM { class MultiValBinWrapper { public: MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, - const std::vector& feature_groups_contained); + const std::vector& feature_groups_contained, const int num_grad_quant_bins); bool IsSparse() { if (multi_val_bin_ != nullptr) { @@ -34,15 +34,17 @@ class 
MultiValBinWrapper { const data_size_t* bagging_use_indices, data_size_t bagging_indices_cnt); + template void HistMove(const std::vector>& hist_buf); + template void HistMerge(std::vector>* hist_buf); void ResizeHistBuf(std::vector>* hist_buf, MultiValBin* sub_multi_val_bin, hist_t* origin_hist_data); - template + template void ConstructHistograms(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -59,55 +61,145 @@ class MultiValBinWrapper { Threading::BlockInfo(num_threads_, num_data, min_block_size_, &n_data_block_, &data_block_size_); ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data); + const int inner_hist_bits = (data_block_size_ * num_grad_quant_bins_ < 256 && HIST_BITS == 16) ? 8 : HIST_BITS; OMP_INIT_EX(); #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int block_id = 0; block_id < n_data_block_; ++block_id) { OMP_LOOP_EX_BEGIN(); data_size_t start = block_id * data_block_size_; data_size_t end = std::min(start + data_block_size_, num_data); - ConstructHistogramsForBlock( - cur_multi_val_bin, start, end, data_indices, gradients, hessians, - block_id, hist_buf); + if (inner_hist_bits == 8) { + ConstructHistogramsForBlock( + cur_multi_val_bin, start, end, data_indices, gradients, hessians, + block_id, hist_buf); + } else { + ConstructHistogramsForBlock( + cur_multi_val_bin, start, end, data_indices, gradients, hessians, + block_id, hist_buf); + } OMP_LOOP_EX_END(); } OMP_THROW_EX(); global_timer.Stop("Dataset::sparse_bin_histogram"); global_timer.Start("Dataset::sparse_bin_histogram_merge"); - HistMerge(hist_buf); + if (inner_hist_bits == 8) { + HistMerge(hist_buf); + } else { + HistMerge(hist_buf); + } global_timer.Stop("Dataset::sparse_bin_histogram_merge"); global_timer.Start("Dataset::sparse_bin_histogram_move"); - HistMove(*hist_buf); + if (inner_hist_bits == 8) { + HistMove(*hist_buf); + } else { + HistMove(*hist_buf); + } global_timer.Stop("Dataset::sparse_bin_histogram_move"); } } - 
template + template void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin, data_size_t start, data_size_t end, const data_size_t* data_indices, const score_t* gradients, const score_t* hessians, int block_id, std::vector>* hist_buf) { - hist_t* data_ptr = origin_hist_data_; - if (block_id == 0) { - if (is_use_subcol_) { - data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 8) { + int8_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + int8_t* data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * block_id * 2; + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt8HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt8(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt8(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt8(start, end, gradients, hessians, + reinterpret_cast(data_ptr)); + } + } else if (HIST_BITS == 16) { + int16_t* data_ptr = reinterpret_cast(origin_hist_data_); + int16_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + } else { + data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt16HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt16(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt16(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt16(start, end, gradients, hessians, + 
reinterpret_cast(data_ptr)); + } + } else { + int32_t* data_ptr = reinterpret_cast(origin_hist_data_); + int32_t* hist_buf_ptr = reinterpret_cast(hist_buf->data()); + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + } else { + data_ptr = hist_buf_ptr + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kInt32HistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrderedInt32(data_indices, start, end, + gradients, hessians, + reinterpret_cast(data_ptr)); + } else { + sub_multi_val_bin->ConstructHistogramInt32(data_indices, start, end, gradients, + hessians, + reinterpret_cast(data_ptr)); + } + } else { + sub_multi_val_bin->ConstructHistogramInt32(start, end, gradients, hessians, + reinterpret_cast(data_ptr)); + } } } else { - data_ptr = hist_buf->data() + - static_cast(num_bin_aligned_) * (block_id - 1) * 2; - } - std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize); - if (USE_INDICES) { - if (ORDERED) { - sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end, - gradients, hessians, data_ptr); + hist_t* data_ptr = origin_hist_data_; + if (block_id == 0) { + if (is_use_subcol_) { + data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } } else { - sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, - hessians, data_ptr); + data_ptr = hist_buf->data() + + static_cast(num_bin_aligned_) * (block_id - 1) * 2; + } + std::memset(reinterpret_cast(data_ptr), 0, num_bin_ * kHistBufferEntrySize); + if (USE_INDICES) { + if (ORDERED) { + sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end, + gradients, hessians, data_ptr); + } else { + sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, + hessians, data_ptr); + } + } else { + 
sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians, + data_ptr); } - } else { - sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians, - data_ptr); } } @@ -162,10 +254,14 @@ class MultiValBinWrapper { int data_block_size_; int min_block_size_; int num_data_; + int num_grad_quant_bins_; hist_t* origin_hist_data_; const size_t kHistBufferEntrySize = 2 * sizeof(hist_t); + const size_t kInt32HistBufferEntrySize = 2 * sizeof(int32_t); + const size_t kInt16HistBufferEntrySize = 2 * sizeof(int16_t); + const size_t kInt8HistBufferEntrySize = 2 * sizeof(int8_t); }; struct TrainingShareStates { @@ -193,7 +289,7 @@ struct TrainingShareStates { void SetMultiValBin(MultiValBin* bin, data_size_t num_data, const std::vector>& feature_groups, - bool dense_only, bool sparse_only); + bool dense_only, bool sparse_only, const int num_grad_quant_bins); void CalcBinOffsets(const std::vector>& feature_groups, std::vector* offsets, bool is_col_wise); @@ -210,14 +306,14 @@ struct TrainingShareStates { } } - template + template void ConstructHistograms(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, const score_t* hessians, hist_t* hist_data) { if (multi_val_bin_wrapper_ != nullptr) { - multi_val_bin_wrapper_->ConstructHistograms( + multi_val_bin_wrapper_->ConstructHistograms( data_indices, num_data, gradients, hessians, &hist_buf_, hist_data); } } diff --git a/python-package/README.rst b/python-package/README.rst index 6cabf33ab29c..2f927f4f2010 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -41,7 +41,7 @@ Build from Sources .. code:: sh - pip install --no-binary :all: lightgbm + pip install --no-binary lightgbm lightgbm For **Linux** and **macOS** users, installation from sources requires installed `CMake`_. @@ -193,34 +193,33 @@ For **Windows** users, if you get any errors during installation and there is th .. 
code:: sh git clone --recursive https://github.com/microsoft/LightGBM.git - cd LightGBM/python-package # export CXX=g++-7 CC=gcc-7 # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine) - python setup.py install + sh ./build-python.sh install Note: ``sudo`` (or administrator rights in **Windows**) may be needed to perform the command. -Run ``python setup.py install --nomp`` to disable **OpenMP** support. All requirements from `Build Threadless Version section <#build-threadless-version>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --nomp`` to disable **OpenMP** support. All requirements from `Build Threadless Version section <#build-threadless-version>`__ apply for this installation option as well. -Run ``python setup.py install --mpi`` to enable **MPI** support. All requirements from `Build MPI Version section <#build-mpi-version>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --mpi`` to enable **MPI** support. All requirements from `Build MPI Version section <#build-mpi-version>`__ apply for this installation option as well. -Run ``python setup.py install --mingw``, if you want to use **MinGW-w64** on **Windows** instead of **Visual Studio**. All requirements from `Build with MinGW-w64 on Windows section <#build-with-mingw-w64-on-windows>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --mingw``, if you want to use **MinGW-w64** on **Windows** instead of **Visual Studio**. All requirements from `Build with MinGW-w64 on Windows section <#build-with-mingw-w64-on-windows>`__ apply for this installation option as well. -Run ``python setup.py install --gpu`` to enable GPU support. All requirements from `Build GPU Version section <#build-gpu-version>`__ apply for this installation option as well. 
To pass additional options to **CMake** use the following syntax: ``python setup.py install --gpu --opencl-include-dir=/usr/local/cuda/include/``, see `Build GPU Version section <#build-gpu-version>`__ for the complete list of them. +Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requirements from `Build GPU Version section <#build-gpu-version>`__ apply for this installation option as well. To pass additional options to **CMake** use the following syntax: ``sh ./build-python.sh install --gpu --opencl-include-dir="/usr/local/cuda/include/"``, see `Build GPU Version section <#build-gpu-version>`__ for the complete list of them. -Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. -Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. -Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. -Run ``python setup.py install --time-costs``, if you want to output time costs for different internal routines. 
All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well. +Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well. -If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide `__) and then just run ``python setup.py install --precompile``. +If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide `__) and then just run ``sh ./build-python.sh install --precompile``. Build Wheel File **************** -You can use ``python setup.py bdist_wheel`` instead of ``python setup.py install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. +You can use ``sh ./build-python.sh bdist_wheel`` instead of ``sh ./build-python.sh install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access. Install Dask-package '''''''''''''''''''' @@ -235,7 +234,7 @@ To install all additional dependencies required for Dask-package, you can append pip install lightgbm[dask] -Or replace ``python setup.py install`` with ``pip install -e .[dask]`` if you are installing the package from source files. +Or replace ``sh ./build-python.sh install`` with ``pip install -e .[dask]`` if you are installing the package from source files. Troubleshooting --------------- @@ -252,9 +251,15 @@ Refer to the walk through examples in `Python guide folder `_.
If you would like to make a contribution and not familiar with PEP 8, please check the PEP 8 style guide first. Otherwise, the check won't pass. Only E501 (line too long) and W503 (line break occurred before a binary operator) can be ignored. +The code style of Python-package follows `PEP 8 `_. -Documentation strings (docstrings) are written in the NumPy style. +The package's documentation strings (docstrings) are written in the `numpydoc style `_. + +To check that a contribution to the package matches its style expectations, run the following from the root of the repo. + +.. code:: sh + + sh .ci/lint-python.sh .. |License| image:: https://img.shields.io/github/license/microsoft/lightgbm.svg :target: https://github.com/microsoft/LightGBM/blob/master/LICENSE diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 445a1047d959..fd07283aa236 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2,6 +2,7 @@ """Wrapper for C API of LightGBM.""" import abc import ctypes +import inspect import json import warnings from collections import OrderedDict @@ -12,7 +13,7 @@ from os.path import getsize from pathlib import Path from tempfile import NamedTemporaryFile -from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np import scipy.sparse @@ -20,6 +21,9 @@ from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series from .libpath import find_lib_path +if TYPE_CHECKING: + from typing import Literal + __all__ = [ 'Booster', 'Dataset', @@ -34,6 +38,10 @@ "ctypes._Pointer[ctypes.c_int32]", "ctypes._Pointer[ctypes.c_int64]" ] +_ctypes_int_array = Union[ + "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]", + "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]" +] _ctypes_float_ptr = Union[ "ctypes._Pointer[ctypes.c_float]", 
"ctypes._Pointer[ctypes.c_double]" @@ -45,8 +53,8 @@ _LGBM_EvalFunctionResultType = Tuple[str, float, bool] _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]] _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool] -_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], str] -_LGBM_FeatureNameConfiguration = Union[List[str], str] +_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"] +_LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"] _LGBM_GroupType = Union[ List[float], List[int], @@ -72,7 +80,8 @@ List[np.ndarray] ] _LGBM_LabelType = Union[ - list, + List[float], + List[int], np.ndarray, pd_Series, pd_DataFrame @@ -276,8 +285,8 @@ def _is_1d_collection(data: Any) -> bool: def _list_to_1d_numpy( data: Any, - dtype: "np.typing.DTypeLike" = np.float32, - name: str = 'list' + dtype: "np.typing.DTypeLike", + name: str ) -> np.ndarray: """Convert data to numpy 1-D array.""" if _is_numpy_1d_array(data): @@ -317,8 +326,8 @@ def _is_2d_collection(data: Any) -> bool: def _data_to_2d_numpy( data: Any, - dtype: "np.typing.DTypeLike" = np.float32, - name: str = 'list' + dtype: "np.typing.DTypeLike", + name: str ) -> np.ndarray: """Convert data to numpy 2-D array.""" if _is_numpy_2d_array(data): @@ -588,13 +597,16 @@ def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray: return data -def _c_float_array(data): +def _c_float_array( + data: np.ndarray +) -> Tuple[_ctypes_float_ptr, int, np.ndarray]: """Get pointer of float numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) if _is_numpy_1d_array(data): data = _convert_from_sliced_object(data) assert data.flags.c_contiguous + ptr_data: _ctypes_float_ptr if data.dtype == np.float32: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) type_data = _C_API_DTYPE_FLOAT32 @@ -608,13 +620,16 @@ def _c_float_array(data): return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed -def 
_c_int_array(data): +def _c_int_array( + data: np.ndarray +) -> Tuple[_ctypes_int_ptr, int, np.ndarray]: """Get pointer of int numpy array / list.""" if _is_1d_list(data): data = np.array(data, copy=False) if _is_numpy_1d_array(data): data = _convert_from_sliced_object(data) assert data.flags.c_contiguous + ptr_data: _ctypes_int_ptr if data.dtype == np.int32: ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) type_data = _C_API_DTYPE_INT32 @@ -677,7 +692,7 @@ def _data_from_pandas( if categorical_feature == 'auto': # use cat cols from DataFrame categorical_feature = cat_cols_not_ordered else: # use cat cols specified by user - categorical_feature = list(categorical_feature) + categorical_feature = list(categorical_feature) # type: ignore[assignment] if feature_name == 'auto': feature_name = list(data.columns) _check_for_bad_pandas_dtypes(data.dtypes) @@ -982,8 +997,8 @@ def predict( elif isinstance(data, list): try: data = np.array(data) - except BaseException: - raise ValueError('Cannot convert data list to numpy array.') + except BaseException as err: + raise ValueError('Cannot convert data list to numpy array.') from err preds, nrow = self.__pred_for_np2d( mat=data, start_iteration=start_iteration, @@ -1001,8 +1016,8 @@ def predict( try: _log_warning('Converting data to scipy sparse matrix.') csr = scipy.sparse.csr_matrix(data) - except BaseException: - raise TypeError(f'Cannot predict data for type {type(data).__name__}') + except BaseException as err: + raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err preds, nrow = self.__pred_for_csr( csr=csr, start_iteration=start_iteration, @@ -1623,10 +1638,10 @@ def _init_from_sample( # c type: double** # each double* element points to start of each column of sample data. 
- sample_col_ptr = (ctypes.POINTER(ctypes.c_double) * ncol)() + sample_col_ptr: _ctypes_float_array = (ctypes.POINTER(ctypes.c_double) * ncol)() # c type int** # each int* points to start of indices for each column - indices_col_ptr = (ctypes.POINTER(ctypes.c_int32) * ncol)() + indices_col_ptr: _ctypes_int_array = (ctypes.POINTER(ctypes.c_int32) * ncol)() for i in range(ncol): sample_col_ptr[i] = _c_float_array(sample_data[i])[0] indices_col_ptr[i] = _c_int_array(sample_indices[i])[0] @@ -1724,18 +1739,20 @@ def _free_handle(self) -> "Dataset": def _set_init_score_by_predictor( self, predictor: Optional[_InnerPredictor], - data, - used_indices: Optional[List[int]] - ): + data: _LGBM_TrainDataType, + used_indices: Optional[Union[List[int], np.ndarray]] + ) -> "Dataset": data_has_header = False if isinstance(data, (str, Path)) and self.params is not None: # check data has header or not data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header")) num_data = self.num_data() if predictor is not None: - init_score = predictor.predict(data, - raw_score=True, - data_has_header=data_has_header) + init_score: Union[np.ndarray, scipy.sparse.spmatrix] = predictor.predict( + data=data, + raw_score=True, + data_has_header=data_has_header + ) init_score = init_score.ravel() if used_indices is not None: assert not self._need_slice @@ -1754,23 +1771,24 @@ def _set_init_score_by_predictor( new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j] init_score = new_init_score elif self.init_score is not None: - init_score = np.zeros(self.init_score.shape, dtype=np.float64) + init_score = np.full_like(self.init_score, fill_value=0.0, dtype=np.float64) else: return self self.set_init_score(init_score) + return self def _lazy_init( self, data: Optional[_LGBM_TrainDataType], - label: Optional[_LGBM_LabelType] = None, - reference: Optional["Dataset"] = None, - weight: Optional[_LGBM_WeightType] = None, - group: Optional[_LGBM_GroupType] = 
None, - init_score: Optional[_LGBM_InitScoreType] = None, - predictor=None, - feature_name='auto', - categorical_feature='auto', - params: Optional[Dict[str, Any]] = None + label: Optional[_LGBM_LabelType], + reference: Optional["Dataset"], + weight: Optional[_LGBM_WeightType], + group: Optional[_LGBM_GroupType], + init_score: Optional[_LGBM_InitScoreType], + predictor: Optional[_InnerPredictor], + feature_name: _LGBM_FeatureNameConfiguration, + categorical_feature: _LGBM_CategoricalFeatureConfiguration, + params: Optional[Dict[str, Any]] ) -> "Dataset": if data is None: self.handle = None @@ -1778,16 +1796,14 @@ def _lazy_init( if reference is not None: self.pandas_categorical = reference.pandas_categorical categorical_feature = reference.categorical_feature - data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, - feature_name, - categorical_feature, - self.pandas_categorical) + data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data, + feature_name=feature_name, + categorical_feature=categorical_feature, + pandas_categorical=self.pandas_categorical) # process for args params = {} if params is None else params - args_names = (getattr(self.__class__, '_lazy_init') - .__code__ - .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount]) + args_names = inspect.signature(self.__class__._lazy_init).parameters.keys() for key in params.keys(): if key in args_names: _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n' @@ -1851,8 +1867,8 @@ def _lazy_init( try: csr = scipy.sparse.csr_matrix(data) self.__init_from_csr(csr, params_str, ref_dataset) - except BaseException: - raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') + except BaseException as err: + raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err if label is not None: self.set_label(label) if self.get_label() is None: @@ -1903,7 +1919,7 @@ def 
__sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr indices = self._create_sample_indices(total_nrow) # Select sampled rows, transpose to column order. - sampled = np.array([row for row in self._yield_row_from_seqlist(seqs, indices)]) + sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices))) sampled = sampled.T filtered = [] @@ -1996,7 +2012,7 @@ def __init_from_list_np2d( ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))() holders = [] - type_ptr_data = None + type_ptr_data = -1 for i, mat in enumerate(mats): if len(mat.shape) != 2: @@ -2013,7 +2029,7 @@ def __init_from_list_np2d( mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32) chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i]) - if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data: + if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data: raise ValueError('Input chunks must have same type') ptr_data[i] = chunk_ptr_data type_ptr_data = chunk_type_ptr_data @@ -2154,13 +2170,13 @@ def construct(self) -> "Dataset": self._update_params(reference_params) if self.used_indices is None: # create valid - self._lazy_init(self.data, label=self.label, reference=self.reference, + self._lazy_init(data=self.data, label=self.label, reference=self.reference, weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor, - feature_name=self.feature_name, params=self.params) + feature_name=self.feature_name, categorical_feature='auto', params=self.params) else: # construct subset - used_indices = _list_to_1d_numpy(self.used_indices, np.int32, name='used_indices') + used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name='used_indices') assert used_indices.flags.c_contiguous if self.reference.group is not None: group_info = np.array(self.reference.group).astype(np.int32, copy=False) @@ -2189,7 +2205,7 @@ def construct(self) -> "Dataset": ) else: # create train - 
self._lazy_init(self.data, label=self.label, + self._lazy_init(data=self.data, label=self.label, reference=None, weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor, feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params) @@ -2329,7 +2345,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame]] ) -> "Dataset": """Set property into the Dataset. @@ -2360,9 +2376,9 @@ def set_field( if field_name == 'init_score': dtype = np.float64 if _is_1d_collection(data): - data = _list_to_1d_numpy(data, dtype, name=field_name) + data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) elif _is_2d_collection(data): - data = _data_to_2d_numpy(data, dtype, name=field_name) + data = _data_to_2d_numpy(data, dtype=dtype, name=field_name) data = data.ravel(order='F') else: raise TypeError( @@ -2371,8 +2387,9 @@ def set_field( ) else: dtype = np.int32 if field_name == 'group' else np.float32 - data = _list_to_1d_numpy(data, dtype, name=field_name) + data = _list_to_1d_numpy(data, dtype=dtype, name=field_name) + ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr] if data.dtype == np.float32 or data.dtype == np.float64: ptr_data, type_data, _ = _c_float_array(data) elif data.dtype == np.int32: @@ -2460,7 +2477,7 @@ def set_categorical_feature( else: if self.categorical_feature != 'auto': _log_warning('categorical_feature in Dataset is overridden.\n' - f'New categorical_feature is {sorted(list(categorical_feature))}') + f'New categorical_feature is {list(categorical_feature)}') self.categorical_feature = categorical_feature return self._free_handle() else: @@ -2528,7 +2545,7 @@ def set_reference(self, reference: "Dataset") -> "Dataset": raise LightGBMError("Cannot set reference after freed raw data, " "set free_raw_data=False when construct Dataset 
to avoid this.") - def set_feature_name(self, feature_name: Union[List[str], str]) -> "Dataset": + def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dataset": """Set feature name. Parameters @@ -2584,7 +2601,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": label = label.to_numpy(dtype=np.float32, na_value=np.nan) label_array = np.ravel(label) else: - label_array = _list_to_1d_numpy(label, name='label') + label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label') self.set_field('label', label_array) self.label = self.get_field('label') # original values can be modified at cpp side return self @@ -2609,7 +2626,7 @@ def set_weight( weight = None self.weight = weight if self.handle is not None and weight is not None: - weight = _list_to_1d_numpy(weight, name='weight') + weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') self.set_field('weight', weight) self.weight = self.get_field('weight') # original values can be modified at cpp side return self @@ -2658,7 +2675,7 @@ def set_group( """ self.group = group if self.handle is not None and group is not None: - group = _list_to_1d_numpy(group, np.int32, name='group') + group = _list_to_1d_numpy(group, dtype=np.int32, name='group') self.set_field('group', group) return self @@ -2759,7 +2776,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: elif isinstance(self.data, Sequence): self.data = self.data[self.used_indices] elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data): - self.data = np.array([row for row in self._yield_row_from_seqlist(self.data, self.used_indices)]) + self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) else: _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" "Returning original raw data") @@ -3097,7 +3114,7 @@ def __init__( ctypes.byref(out_num_class))) self.__num_class = out_num_class.value # buffer for 
inner predict - self.__inner_predict_buffer = [None] + self.__inner_predict_buffer: List[Optional[np.ndarray]] = [None] self.__is_predicted_cur_iter = [False] self.__get_eval_info() self.pandas_categorical = train_set.pandas_categorical @@ -3288,13 +3305,21 @@ def trees_to_dataframe(self) -> pd_DataFrame: if self.num_trees() == 0: raise LightGBMError('There are no trees in this Booster and thus nothing to parse') - def _is_split_node(tree): + def _is_split_node(tree: Dict[str, Any]) -> bool: return 'split_index' in tree.keys() - def create_node_record(tree, node_depth=1, tree_index=None, - feature_names=None, parent_node=None): - - def _get_node_index(tree, tree_index): + def create_node_record( + tree: Dict[str, Any], + node_depth: int = 1, + tree_index: Optional[int] = None, + feature_names: Optional[List[str]] = None, + parent_node: Optional[str] = None + ) -> Dict[str, Any]: + + def _get_node_index( + tree: Dict[str, Any], + tree_index: Optional[int] + ) -> str: tree_num = f'{tree_index}-' if tree_index is not None else '' is_split = _is_split_node(tree) node_type = 'S' if is_split else 'L' @@ -3302,7 +3327,10 @@ def _get_node_index(tree, tree_index): node_num = tree.get('split_index' if is_split else 'leaf_index', 0) return f"{tree_num}{node_type}{node_num}" - def _get_split_feature(tree, feature_names): + def _get_split_feature( + tree: Dict[str, Any], + feature_names: Optional[List[str]] + ) -> Optional[str]: if _is_split_node(tree): if feature_names is not None: feature_name = feature_names[tree['split_feature']] @@ -3312,11 +3340,11 @@ def _get_split_feature(tree, feature_names): feature_name = None return feature_name - def _is_single_node_tree(tree): + def _is_single_node_tree(tree: Dict[str, Any]) -> bool: return set(tree.keys()) == {'leaf_value'} # Create the node record, and populate universal data members - node = OrderedDict() + node: Dict[str, Union[int, str, None]] = OrderedDict() node['tree_index'] = tree_index node['node_depth'] = node_depth 
node['node_index'] = _get_node_index(tree, tree_index) @@ -3353,10 +3381,15 @@ def _is_single_node_tree(tree): return node - def tree_dict_to_node_list(tree, node_depth=1, tree_index=None, - feature_names=None, parent_node=None): + def tree_dict_to_node_list( + tree: Dict[str, Any], + node_depth: int = 1, + tree_index: Optional[int] = None, + feature_names: Optional[List[str]] = None, + parent_node: Optional[str] = None + ) -> List[Dict[str, Any]]: - node = create_node_record(tree, + node = create_node_record(tree=tree, node_depth=node_depth, tree_index=tree_index, feature_names=feature_names, @@ -3369,11 +3402,12 @@ def tree_dict_to_node_list(tree, node_depth=1, tree_index=None, children = ['left_child', 'right_child'] for child in children: subtree_list = tree_dict_to_node_list( - tree[child], + tree=tree[child], node_depth=node_depth + 1, tree_index=tree_index, feature_names=feature_names, - parent_node=node['node_index']) + parent_node=node['node_index'] + ) # In tree format, "subtree_list" is a list of node records (dicts), # and we add node to the list. res.extend(subtree_list) @@ -3383,7 +3417,7 @@ def tree_dict_to_node_list(tree, node_depth=1, tree_index=None, feature_names = model_dict['feature_names'] model_list = [] for tree in model_dict['tree_info']: - model_list.extend(tree_dict_to_node_list(tree['tree_structure'], + model_list.extend(tree_dict_to_node_list(tree=tree['tree_structure'], tree_index=tree['tree_index'], feature_names=feature_names)) @@ -3558,8 +3592,8 @@ def __boost( if self.__num_class > 1: grad = grad.ravel(order='F') hess = hess.ravel(order='F') - grad = _list_to_1d_numpy(grad, name='gradient') - hess = _list_to_1d_numpy(hess, name='hessian') + grad = _list_to_1d_numpy(grad, dtype=np.float32, name='gradient') + hess = _list_to_1d_numpy(hess, dtype=np.float32, name='hessian') assert grad.flags.c_contiguous assert hess.flags.c_contiguous if len(grad) != len(hess): @@ -4068,7 +4102,7 @@ def predict( Prediction result. 
Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``). """ - predictor = self._to_predictor(deepcopy(kwargs)) + predictor = self._to_predictor(pred_parameter=deepcopy(kwargs)) if num_iteration is None: if start_iteration <= 0: num_iteration = self.best_iteration @@ -4158,8 +4192,8 @@ def refit( raise LightGBMError('Cannot refit due to null objective function.') if dataset_params is None: dataset_params = {} - predictor = self._to_predictor(deepcopy(kwargs)) - leaf_preds = predictor.predict( + predictor = self._to_predictor(pred_parameter=deepcopy(kwargs)) + leaf_preds: np.ndarray = predictor.predict( # type: ignore[assignment] data=data, start_iteration=-1, pred_leaf=True, @@ -4262,7 +4296,7 @@ def set_leaf_output( def _to_predictor( self, - pred_parameter: Optional[Dict[str, Any]] = None + pred_parameter: Dict[str, Any] ) -> _InnerPredictor: """Convert to predictor.""" predictor = _InnerPredictor(booster_handle=self.handle, pred_parameter=pred_parameter) @@ -4414,7 +4448,7 @@ def add(root: Dict[str, Any]) -> None: model = self.dump_model() feature_names = model.get('feature_names') tree_infos = model['tree_info'] - values = [] + values: List[float] = [] for tree_info in tree_infos: add(tree_info['tree_structure']) @@ -4488,16 +4522,16 @@ def __inner_predict(self, data_idx: int) -> np.ndarray: # avoid to predict many time in one iteration if not self.__is_predicted_cur_iter[data_idx]: tmp_out_len = ctypes.c_int64(0) - data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double)) + data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double)) # type: ignore[union-attr] _safe_call(_LIB.LGBM_BoosterGetPredict( self.handle, ctypes.c_int(data_idx), ctypes.byref(tmp_out_len), data_ptr)) - if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]): + if tmp_out_len.value != 
len(self.__inner_predict_buffer[data_idx]): # type: ignore[arg-type] raise ValueError(f"Wrong length of predict results for data {data_idx}") self.__is_predicted_cur_iter[data_idx] = True - result = self.__inner_predict_buffer[data_idx] + result: np.ndarray = self.__inner_predict_buffer[data_idx] # type: ignore[assignment] if self.__num_class > 1: num_data = result.size // self.__num_class result = result.reshape(num_data, self.__num_class, order='F') diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 45f67edf5949..0c5d3e7956fa 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -15,6 +15,10 @@ _EvalResultDict = Dict[str, Dict[str, List[Any]]] _EvalResultTuple = Union[ + _LGBM_BoosterEvalMethodResultType, + Tuple[str, str, float, bool, float] +] +_ListOfEvalResultTuples = Union[ List[_LGBM_BoosterEvalMethodResultType], List[Tuple[str, str, float, bool, float]] ] @@ -23,7 +27,7 @@ class EarlyStopException(Exception): """Exception of early stopping.""" - def __init__(self, best_iteration: int, best_score: _EvalResultTuple) -> None: + def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None: """Create early stopping exception. 
Parameters @@ -55,7 +59,7 @@ def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str: return f"{value[0]}'s {value[1]}: {value[2]:g}" elif len(value) == 5: if show_stdv: - return f"{value[0]}'s {value[1]}: {value[2]:g} + {value[4]:g}" + return f"{value[0]}'s {value[1]}: {value[2]:g} + {value[4]:g}" # type: ignore[misc] else: return f"{value[0]}'s {value[1]}: {value[2]:g}" else: @@ -256,7 +260,7 @@ def __init__( def _reset_storages(self) -> None: self.best_score: List[float] = [] self.best_iter: List[int] = [] - self.best_score_list: List[Union[_EvalResultTuple, None]] = [] + self.best_score_list: List[_ListOfEvalResultTuples] = [] self.cmp_op: List[Callable[[float, float], bool]] = [] self.first_metric = '' @@ -297,7 +301,7 @@ def _init(self, env: CallbackEnv) -> None: self._reset_storages() - n_metrics = len(set(m[1] for m in env.evaluation_result_list)) + n_metrics = len({m[1] for m in env.evaluation_result_list}) n_datasets = len(env.evaluation_result_list) // n_metrics if isinstance(self.min_delta, list): if not all(t >= 0 for t in self.min_delta): @@ -327,7 +331,6 @@ def _init(self, env: CallbackEnv) -> None: self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1] for eval_ret, delta in zip(env.evaluation_result_list, deltas): self.best_iter.append(0) - self.best_score_list.append(None) if eval_ret[3]: # greater is better self.best_score.append(float('-inf')) self.cmp_op.append(partial(self._gt_delta, delta=delta)) @@ -350,12 +353,17 @@ def __call__(self, env: CallbackEnv) -> None: self._init(env) if not self.enabled: return + # self.best_score_list is initialized to an empty list + first_time_updating_best_score_list = (self.best_score_list == []) for i in range(len(env.evaluation_result_list)): score = env.evaluation_result_list[i][2] - if self.best_score_list[i] is None or self.cmp_op[i](score, self.best_score[i]): + if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]): self.best_score[i] = score 
self.best_iter[i] = env.iteration - self.best_score_list[i] = env.evaluation_result_list + if first_time_updating_best_score_list: + self.best_score_list.append(env.evaluation_result_list) + else: + self.best_score_list[i] = env.evaluation_result_list # split is needed for " " case (e.g. "train l1") eval_name_splitted = env.evaluation_result_list[i][1].split(" ") if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 02bf7af2d253..c856fa1a9b11 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -77,9 +77,9 @@ def __init__(self, *args, **kwargs): from sklearn.utils.validation import assert_all_finite, check_array, check_X_y try: from sklearn.exceptions import NotFittedError - from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold + from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold, train_test_split except ImportError: - from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold + from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold, train_test_split from sklearn.utils.validation import NotFittedError try: from sklearn.utils.validation import _check_sample_weight @@ -100,6 +100,7 @@ def _check_sample_weight(sample_weight, X, dtype=None): LGBMNotFittedError = NotFittedError _LGBMStratifiedKFold = StratifiedKFold _LGBMGroupKFold = GroupKFold + _LGBMTrainTestSplit = train_test_split _LGBMCheckXY = check_X_y _LGBMCheckArray = check_array _LGBMCheckSampleWeight = _check_sample_weight diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 8ea1d6907081..88487d515f81 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -21,9 +21,9 @@ from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat, 
dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series, default_client, delayed, pd_DataFrame, pd_Series, wait) -from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction, - _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit, - _lgbmmodel_doc_predict) +from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomEvalSetSplitter, + _LGBM_ScikitCustomObjectiveFunction, _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, + _lgbmmodel_doc_fit, _lgbmmodel_doc_predict) __all__ = [ 'DaskLGBMClassifier', @@ -189,7 +189,7 @@ def _train_part( local_listen_port: int, num_machines: int, return_model: bool, - time_out: int = 120, + time_out: int, **kwargs: Any ) -> Optional[LGBMModel]: network_params = { @@ -576,13 +576,48 @@ def _train( # pad eval sets when they come in different sizes. n_largest_eval_parts = max(x[0].npartitions for x in eval_set) - eval_sets = defaultdict(list) + eval_sets: Dict[ + int, + List[ + Union[ + _DatasetNames, + Tuple[ + List[Optional[_DaskMatrixLike]], + List[Optional[_DaskVectorLike]] + ] + ] + ] + ] = defaultdict(list) if eval_sample_weight: - eval_sample_weights = defaultdict(list) + eval_sample_weights: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskVectorLike]] + ] + ] + ] = defaultdict(list) if eval_group: - eval_groups = defaultdict(list) + eval_groups: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskVectorLike]] + ] + ] + ] = defaultdict(list) if eval_init_score: - eval_init_scores = defaultdict(list) + eval_init_scores: Dict[ + int, + List[ + Union[ + _DatasetNames, + List[Optional[_DaskMatrixLike]] + ] + ] + ] = defaultdict(list) for i, (X_eval, y_eval) in enumerate(eval_set): n_this_eval_parts = X_eval.npartitions @@ -610,8 +645,8 @@ def _train( eval_sets[parts_idx].append(([x_e], [y_e])) else: # append additional chunks of this eval set to 
this part. - eval_sets[parts_idx][-1][0].append(x_e) - eval_sets[parts_idx][-1][1].append(y_e) + eval_sets[parts_idx][-1][0].append(x_e) # type: ignore[index, union-attr] + eval_sets[parts_idx][-1][1].append(y_e) # type: ignore[index, union-attr] if eval_sample_weight: if eval_sample_weight[i] is sample_weight: @@ -631,7 +666,7 @@ def _train( if j < n_parts: eval_sample_weights[parts_idx].append([w_e]) else: - eval_sample_weights[parts_idx][-1].append(w_e) + eval_sample_weights[parts_idx][-1].append(w_e) # type: ignore[union-attr] if eval_init_score: if eval_init_score[i] is init_score: @@ -649,7 +684,7 @@ def _train( if j < n_parts: eval_init_scores[parts_idx].append([init_score_e]) else: - eval_init_scores[parts_idx][-1].append(init_score_e) + eval_init_scores[parts_idx][-1].append(init_score_e) # type: ignore[union-attr] if eval_group: if eval_group[i] is group: @@ -667,7 +702,7 @@ def _train( if j < n_parts: eval_groups[parts_idx].append([g_e]) else: - eval_groups[parts_idx][-1].append(g_e) + eval_groups[parts_idx][-1].append(g_e) # type: ignore[union-attr] # assign sub-eval_set components to worker parts. 
for parts_idx, e_set in eval_sets.items(): @@ -686,7 +721,8 @@ def _train( for part in parts: if part.status == 'error': # type: ignore - return part # trigger error locally + # trigger error locally + return part # type: ignore[return-value] # Find locations of all parts and map them to particular Dask workers key_to_part_dict = {part.key: part for part in parts} # type: ignore @@ -701,7 +737,7 @@ def _train( for worker in worker_map: has_eval_set = False for part in worker_map[worker]: - if 'eval_set' in part.result(): + if 'eval_set' in part.result(): # type: ignore[attr-defined] has_eval_set = True break @@ -751,7 +787,7 @@ def _train( else: if listen_port_in_params: _log_info("Using passed-in 'local_listen_port' for all workers") - unique_hosts = set(urlparse(a).hostname for a in worker_addresses) + unique_hosts = {urlparse(a).hostname for a in worker_addresses} if len(unique_hosts) < len(worker_addresses): msg = ( "'local_listen_port' was provided in Dask training parameters, but at least one " @@ -836,6 +872,7 @@ def _predict_part( **kwargs: Any ) -> _DaskPart: + result: _DaskPart if part.shape[0] == 0: result = np.array([]) elif pred_proba: @@ -1001,7 +1038,7 @@ def _extract(items: List[Any], i: int) -> Any: **kwargs, ) pred_row = predict_fn(data_row) - chunks = (data.chunks[0],) + chunks: Tuple[int, ...] 
= (data.chunks[0],) map_blocks_kwargs = {} if len(pred_row.shape) > 1: chunks += (pred_row.shape[1],) @@ -1133,10 +1170,19 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', + early_stopping: bool = False, + validation_fraction: Optional[float] = 0.1, + n_iter_no_change: int = 10, + validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None, client: Optional[Client] = None, **kwargs: Any ): """Docstring is inherited from the lightgbm.LGBMClassifier.__init__.""" + if early_stopping: + raise NotImplementedError( + "Early Stopping is not available for the Dask interface of lightgbm " + f"(found early_stopping={early_stopping})" + ) self.client = client super().__init__( boosting_type=boosting_type, @@ -1235,7 +1281,7 @@ def fit( # type: ignore[override] def predict( self, - X: _DaskMatrixLike, + X: _DaskMatrixLike, # type: ignore[override] raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -1270,7 +1316,7 @@ def predict( def predict_proba( self, - X: _DaskMatrixLike, + X: _DaskMatrixLike, # type: ignore[override] raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -1338,10 +1384,19 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', + early_stopping: bool = False, + validation_fraction: Optional[float] = 0.1, + n_iter_no_change: int = 10, + validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None, client: Optional[Client] = None, **kwargs: Any ): """Docstring is inherited from the lightgbm.LGBMRegressor.__init__.""" + if early_stopping: + raise NotImplementedError( + "Early Stopping is not available for the Dask interface of lightgbm " + f"(found early_stopping={early_stopping})" + ) self.client = client super().__init__( 
boosting_type=boosting_type, @@ -1441,7 +1496,7 @@ def fit( # type: ignore[override] def predict( self, - X: _DaskMatrixLike, + X: _DaskMatrixLike, # type: ignore[override] raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -1508,10 +1563,19 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', + early_stopping: bool = False, + validation_fraction: Optional[float] = 0.1, + n_iter_no_change: int = 10, + validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None, client: Optional[Client] = None, **kwargs: Any ): """Docstring is inherited from the lightgbm.LGBMRanker.__init__.""" + if early_stopping: + raise NotImplementedError( + "Early Stopping is not available for the Dask interface of lightgbm " + f"(found early_stopping={early_stopping})" + ) self.client = client super().__init__( boosting_type=boosting_type, @@ -1616,7 +1680,7 @@ def fit( # type: ignore[override] def predict( self, - X: _DaskMatrixLike, + X: _DaskMatrixLike, # type: ignore[override] raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py index 3a0c93fba332..1f8624b7055d 100644 --- a/python-package/lightgbm/engine.py +++ b/python-package/lightgbm/engine.py @@ -11,8 +11,9 @@ from . 
import callback from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor, - _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction, - _LGBM_FeatureNameConfiguration, _log_warning) + _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration, + _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, + _log_warning) from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold __all__ = [ @@ -22,9 +23,15 @@ ] -_LGBM_CustomMetricFunction = Callable[ - [np.ndarray, Dataset], - Union[Tuple[str, float, bool], List[Tuple[str, float, bool]]] +_LGBM_CustomMetricFunction = Union[ + Callable[ + [np.ndarray, Dataset], + _LGBM_EvalFunctionResultType, + ], + Callable[ + [np.ndarray, Dataset], + List[_LGBM_EvalFunctionResultType] + ], ] _LGBM_PreprocFunction = Callable[ @@ -134,6 +141,20 @@ def train( booster : Booster The trained Booster model. """ + if not isinstance(train_set, Dataset): + raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") + + if num_boost_round <= 0: + raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") + + if isinstance(valid_sets, list): + for i, valid_item in enumerate(valid_sets): + if not isinstance(valid_item, Dataset): + raise TypeError( + "Every item in valid_sets must be a Dataset object. " + f"Item {i} has type '{type(valid_item).__name__}'." 
+ ) + # create predictor first params = copy.deepcopy(params) params = _choose_param_value( @@ -160,17 +181,12 @@ def train( params.pop("early_stopping_round") first_metric_only = params.get('first_metric_only', False) - if num_boost_round <= 0: - raise ValueError("num_boost_round should be greater than zero.") predictor: Optional[_InnerPredictor] = None if isinstance(init_model, (str, Path)): predictor = _InnerPredictor(model_file=init_model, pred_parameter=params) elif isinstance(init_model, Booster): - predictor = init_model._to_predictor(dict(init_model.params, **params)) + predictor = init_model._to_predictor(pred_parameter=dict(init_model.params, **params)) init_iteration = predictor.num_total_iteration if predictor is not None else 0 - # check dataset - if not isinstance(train_set, Dataset): - raise TypeError("Training only accepts Dataset object") train_set._update_params(params) \ ._set_predictor(predictor) \ @@ -193,8 +209,6 @@ def train( if valid_names is not None: train_data_name = valid_names[i] continue - if not isinstance(valid_data, Dataset): - raise TypeError("Training only accepts Dataset object") reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set)) if valid_names is not None and len(valid_names) > i: name_valid_sets.append(valid_names[i]) @@ -211,7 +225,7 @@ def train( if "early_stopping_round" in params: callbacks_set.add( callback.early_stopping( - stopping_rounds=params["early_stopping_round"], + stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, verbose=_choose_param_value( main_param_name="verbosity", @@ -251,7 +265,7 @@ def train( booster.update(fobj=fobj) - evaluation_result_list = [] + evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = [] # check evaluation result. 
if valid_sets is not None: if is_valid_contain_train: @@ -531,7 +545,7 @@ def cv( callbacks: Optional[List[Callable]] = None, eval_train_metric: bool = False, return_cvbooster: bool = False -) -> Dict[str, Any]: +) -> Dict[str, Union[List[float], CVBooster]]: """Perform the cross-validation with given parameters. Parameters @@ -637,10 +651,14 @@ def cv( {'metric1-mean': [values], 'metric1-stdv': [values], 'metric2-mean': [values], 'metric2-stdv': [values], ...}. - If ``return_cvbooster=True``, also returns trained boosters via ``cvbooster`` key. + If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key. """ if not isinstance(train_set, Dataset): - raise TypeError("Training only accepts Dataset object") + raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.") + + if num_boost_round <= 0: + raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") + params = copy.deepcopy(params) params = _choose_param_value( main_param_name='objective', @@ -666,12 +684,10 @@ def cv( params.pop("early_stopping_round") first_metric_only = params.get('first_metric_only', False) - if num_boost_round <= 0: - raise ValueError("num_boost_round should be greater than zero.") if isinstance(init_model, (str, Path)): predictor = _InnerPredictor(model_file=init_model, pred_parameter=params) elif isinstance(init_model, Booster): - predictor = init_model._to_predictor(dict(init_model.params, **params)) + predictor = init_model._to_predictor(pred_parameter=dict(init_model.params, **params)) else: predictor = None @@ -702,7 +718,7 @@ def cv( if "early_stopping_round" in params: callbacks_set.add( callback.early_stopping( - stopping_rounds=params["early_stopping_round"], + stopping_rounds=params["early_stopping_round"], # type: ignore[arg-type] first_metric_only=first_metric_only, verbose=_choose_param_value( main_param_name="verbosity", @@ -725,8 +741,8 @@ def cv( 
begin_iteration=0, end_iteration=num_boost_round, evaluation_result_list=None)) - cvfolds.update(fobj=fobj) - res = _agg_cv_result(cvfolds.eval_valid(feval)) + cvfolds.update(fobj=fobj) # type: ignore[call-arg] + res = _agg_cv_result(cvfolds.eval_valid(feval)) # type: ignore[call-arg] for _, key, mean, _, std in res: results[f'{key}-mean'].append(mean) results[f'{key}-stdv'].append(std) @@ -747,6 +763,6 @@ def cv( break if return_cvbooster: - results['cvbooster'] = cvfolds + results['cvbooster'] = cvfolds # type: ignore[assignment] return dict(results) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index bf3190320dc3..1bd991c0f618 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -6,15 +6,16 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np +import scipy.sparse from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType, _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration, - _log_warning) -from .callback import _EvalResultDict, record_evaluation + _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning) +from .callback import _EarlyStoppingCallback, _EvalResultDict, early_stopping, record_evaluation from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, - _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, - dt_DataTable, pd_DataFrame) + _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMGroupKFold, _LGBMLabelEncoder, _LGBMModelBase, + _LGBMRegressorBase, _LGBMTrainTestSplit, dt_DataTable, pd_DataFrame) from .engine import train __all__ = [ @@ -24,39 +25,79 @@ 'LGBMRegressor', ] +_LGBM_ScikitMatrixLike = Union[ + dt_DataTable, + 
List[Union[List[float], List[int]]], + np.ndarray, + pd_DataFrame, + scipy.sparse.spmatrix +] _LGBM_ScikitCustomObjectiveFunction = Union[ + # f(labels, preds) Callable[ - [np.ndarray, np.ndarray], + [Optional[np.ndarray], np.ndarray], Tuple[np.ndarray, np.ndarray] ], + # f(labels, preds, weights) Callable[ - [np.ndarray, np.ndarray, np.ndarray], + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray] ], + # f(labels, preds, weights, group) Callable[ - [np.ndarray, np.ndarray, np.ndarray, np.ndarray], + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], Tuple[np.ndarray, np.ndarray] ], ] _LGBM_ScikitCustomEvalFunction = Union[ + # f(labels, preds) + Callable[ + [Optional[np.ndarray], np.ndarray], + _LGBM_EvalFunctionResultType + ], Callable[ - [np.ndarray, np.ndarray], - Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]] + [Optional[np.ndarray], np.ndarray], + List[_LGBM_EvalFunctionResultType] ], + # f(labels, preds, weights) Callable[ - [np.ndarray, np.ndarray, np.ndarray], - Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]] + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + _LGBM_EvalFunctionResultType ], Callable[ - [np.ndarray, np.ndarray, np.ndarray, np.ndarray], - Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]] + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]], + List[_LGBM_EvalFunctionResultType] ], + # f(labels, preds, weights, group) + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + _LGBM_EvalFunctionResultType + ], + Callable[ + [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]], + List[_LGBM_EvalFunctionResultType] + ] ] _LGBM_ScikitEvalMetricType = Union[ str, _LGBM_ScikitCustomEvalFunction, List[Union[str, _LGBM_ScikitCustomEvalFunction]] ] +_LGBM_ScikitCustomEvalSetSplitter = Union[ + Callable[ + [_LGBM_ScikitMatrixLike, 
_LGBM_LabelType], + Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType] + ], + Callable[ + [_LGBM_ScikitMatrixLike, _LGBM_LabelType, Optional[np.ndarray]], + Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType, Optional[np.ndarray], Optional[np.ndarray]] + ], + Callable[ + [_LGBM_ScikitMatrixLike, _LGBM_LabelType, Optional[np.ndarray], _LGBM_GroupType], + Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType, Optional[np.ndarray], Optional[np.ndarray], _LGBM_GroupType, _LGBM_GroupType] + ], +] +_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType] class _ObjectiveFunctionWrapper: @@ -127,11 +168,11 @@ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np. labels = dataset.get_label() argc = len(signature(self.func).parameters) if argc == 2: - grad, hess = self.func(labels, preds) + grad, hess = self.func(labels, preds) # type: ignore[call-arg] elif argc == 3: - grad, hess = self.func(labels, preds, dataset.get_weight()) + grad, hess = self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg] elif argc == 4: - grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) + grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore [call-arg] else: raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}") return grad, hess @@ -205,15 +246,280 @@ def __call__( labels = dataset.get_label() argc = len(signature(self.func).parameters) if argc == 2: - return self.func(labels, preds) + return self.func(labels, preds) # type: ignore[call-arg] elif argc == 3: - return self.func(labels, preds, dataset.get_weight()) + return self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg] elif argc == 4: - return self.func(labels, preds, dataset.get_weight(), dataset.get_group()) + return self.func(labels, preds, 
dataset.get_weight(), dataset.get_group()) # type: ignore[call-arg] else: raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}") +def _train_test_split( + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + weight, + test_size: float, + random_state: Optional[Union[int, np.random.RandomState]], + stratified: bool, +) -> Tuple[ + _LGBM_ScikitMatrixLike, + _LGBM_ScikitMatrixLike, + _LGBM_LabelType, + _LGBM_LabelType, + Optional[np.ndarray], + Optional[np.ndarray], +]: + """Split X, y and weights into random train and test subsets. + + Parameters + ---------- + X : numpy 2-D array of shape = [n_samples, n_features] + The features matrix. + y : numpy 1-D array of shape = [n_samples] + The target values. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + test_size : float + Should be between 0.0 and 1.0 and represent the proportion of the dataset + that to include in the test split. + random_state : int, RandomState instance or None + Controls the shuffling applied to the data before applying the split. + Pass an int for reproducible output across multiple function calls. + stratified : bool + If true, split data in a stratified fashion. + + Returns + ------- + Tuple[ np.ndarray, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray] ] + X_train: numpy 2-D array of shape = [n_train_samples, n_features] + The features matrix to be used for training. + X_val: numpy 2-D array of shape = [n_val_samples, n_features] + The features matrix to be used as a evaluation set for early stopping. + y_train: numpy 1-D array of shape = [n_train_samples] + The target values to be used for training. + y_val: numpy 1-D array of shape = [n_val_samples] + The target values to be used as a evaluation set for early stopping. + weight_train: numpy 1-D array of shape = [n_train_samples], optional + The weight of samples to be used for training. 
Returned if input weights is not None. + Weights should be non-negative. + weight_val: numpy 1-D array of shape = [n_val_samples], optional + The weight of samples to be used as a evaluation set for early stopping. Returned if input weights is not None. + Weights should be non-negative. + """ + stratify = y if stratified else None + if weight is not None: + return _LGBMTrainTestSplit( + X, y, weight, test_size=test_size, random_state=random_state, stratify=stratify + ) + else: + X_train, X_val, y_train, y_val = _LGBMTrainTestSplit( + X, y, test_size=test_size, random_state=random_state, stratify=stratify + ) + return X_train, X_val, y_train, y_val, None, None + + +def _train_test_group_split( + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + weight, + group: _LGBM_GroupType, + n_splits: int +) -> Tuple[ + _LGBM_ScikitMatrixLike, + _LGBM_ScikitMatrixLike, + _LGBM_LabelType, + _LGBM_LabelType, + Optional[np.ndarray], + Optional[np.ndarray], + _LGBM_GroupType, + _LGBM_GroupType, +]: + """Split X, y, weights and group into train and test subsets. + + Parameters + ---------- + X : numpy 2-D array of shape = [n_samples, n_features] + The features matrix. + y : numpy 1-D array of shape = [n_samples] + The target values. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + n_splits : int + controls the size of the test set. 
The test set will have + size = n_samples / n_splits + + Returns + ------- + Tuple[ np.ndarray, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray, np.ndarray ] + X_train: numpy 2-D array of shape = [n_train_samples, n_features] + The features matrix to be used for training. + X_val: numpy 2-D array of shape = [n_val_samples, n_features] + The features matrix to be used as a evaluation set for early stopping. + y_train: numpy 1-D array of shape = [n_train_samples] + The target values to be used for training. + y_val: numpy 1-D array of shape = [n_val_samples] + The target values to be used as a evaluation set for early stopping. + weight_train: numpy 1-D array of shape = [n_train_samples], optional + The weight of samples to be used for training. Returned if input weights is not None. + Weights should be non-negative. + weight_val: numpy 1-D array of shape = [n_val_samples], optional + The weight of samples to be used as a evaluation set for early stopping. Returned if input weights is not None. + Weights should be non-negative. + group_train: numpy 1-D array + Group/query data to be used for training. + Only used in the learning-to-rank task. + sum(group) = n_train_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + group_val: numpy 1-D array + Group/query data to be used as a evaluation set for early stopping. + Only used in the learning-to-rank task. + sum(group) = n_val_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. 
+ """ + group_k_fold = _LGBMGroupKFold(n_splits) + group = np.array(group, dtype=np.int32, copy=False) + group_flattened = np.repeat(list(range(len(group))), group) + train_idx, val_idx = next(group_k_fold.split(X, y, groups=group_flattened)) + full_ds = Dataset(data=X, label=y, weight=weight, group=group, free_raw_data=False) + train_ds = full_ds.subset(sorted(train_idx)).construct() + val_ds = full_ds.subset(sorted(val_idx)).construct() + return ( + train_ds.get_data(), + val_ds.get_data(), + train_ds.get_label(), + val_ds.get_label(), + train_ds.get_weight(), + val_ds.get_weight(), + train_ds.get_group(), + val_ds.get_group(), + ) + + +def _train_test_split_custom_splitter( + custom_splitter: _LGBM_ScikitCustomEvalSetSplitter, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + weight, + group: Optional[_LGBM_GroupType] +) -> Tuple[ + _LGBM_ScikitMatrixLike, + _LGBM_ScikitMatrixLike, + _LGBM_LabelType, + _LGBM_LabelType, + Optional[np.ndarray], + Optional[np.ndarray], + Optional[_LGBM_GroupType], + Optional[_LGBM_GroupType], +]: + """Call passed custom_splitter with appropriate arguments. + + Parameters + ---------- + func : callable + Expects a callable with following signatures: + ``func(X, y) -> X_train, X_val, y_train, y_val``, + ``func(X, y, weight) -> X_train, X_val, y_train, y_val, weight_train, weight_val`` or + ``func(X, y, weight, group) -> + X_train, X_val, y_train, y_val, weight_train, weight_val, group_train, group_val`` + where + X : numpy 2-D array of shape = [n_samples, n_features] + The features matrix. + y : numpy 1-D array of shape = [n_samples] + The target values. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. 
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + X_train: numpy 2-D array of shape = [n_train_samples, n_features] + The features matrix to be used for training. + X_val: numpy 2-D array of shape = [n_val_samples, n_features] + The features matrix to be used as a evaluation set for early stopping. + y_train: numpy 1-D array of shape = [n_train_samples] + The target values to be used for training. + y_val: numpy 1-D array of shape = [n_val_samples] + The target values to be used as a evaluation set for early stopping. + weight_train: numpy 1-D array of shape = [n_train_samples] + The weight of samples to be used for training. Weights should be non-negative. + weight_val: numpy 1-D array of shape = [n_val_samples] + The weight of samples to be used as a evaluation set for early stopping. Weights should be non-negative. + group_train: numpy 1-D array + Group/query data to be used for training. + Only used in the learning-to-rank task. + sum(group) = n_train_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + group_val: numpy 1-D array + Group/query data to be used as a evaluation set for early stopping. + Only used in the learning-to-rank task. + sum(group) = n_val_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + X : numpy 2-D array of shape = [n_samples, n_features] + The features matrix. 
+ y : numpy 1-D array of shape = [n_samples] + The target values. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. + group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + + Returns + ------- + X_train: numpy 2-D array of shape = [n_train_samples, n_features] + The features matrix to be used for training. + X_val: numpy 2-D array of shape = [n_val_samples, n_features] + The features matrix to be used as a evaluation set for early stopping. + y_train: numpy 1-D array of shape = [n_train_samples] + The target values to be used for training. + y_val: numpy 1-D array of shape = [n_val_samples] + The target values to be used as a evaluation set for early stopping. + weight_train: numpy 1-D array of shape = [n_train_samples] + The weight of samples to be used for training. Weights should be non-negative. + weight_val: numpy 1-D array of shape = [n_val_samples] + The weight of samples to be used as a evaluation set for early stopping. Weights should be non-negative. + group_train: numpy 1-D array + Group/query data to be used for training. + Only used in the learning-to-rank task. + sum(group) = n_train_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + group_val: numpy 1-D array + Group/query data to be used as a evaluation set for early stopping. + Only used in the learning-to-rank task. + sum(group) = n_val_samples. 
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + """ + argc = len(signature(custom_splitter).parameters) + if argc == 2: + X_train, X_val, y_train, y_val = custom_splitter(X, y) + return X_train, X_val, y_train, y_val, None, None, None, None + elif argc == 3: + X_train, X_val, y_train, y_val, weight_train, weight_val = custom_splitter(X, y, weight) + return X_train, X_val, y_train, y_val, weight_train, weight_val, None, None + elif argc == 4: + return custom_splitter(X, y, weight, group) + else: + raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}") + + # documentation templates for LGBMModel methods are shared between the classes in # this module and those in the ``dask`` module @@ -385,6 +691,10 @@ def __init__( random_state: Optional[Union[int, np.random.RandomState]] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', + early_stopping: bool = False, + validation_fraction: Optional[float] = 0.1, + n_iter_no_change: int = 10, + validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None, **kwargs ): r"""Construct a gradient boosting model. @@ -461,6 +771,25 @@ def __init__( The type of feature importance to be filled into ``feature_importances_``. If 'split', result contains numbers of times the feature is used in a model. If 'gain', result contains total gains of splits which use the feature. + early_stopping : bool, optional (default=False) + If ``True``, enables early stopping. If ``False`` and no ``early_stopping`` callbacks are passed + to the ``fit`` method, then early stopping is disabled. + validation_fraction : float or None, optional (default=0.1) + Proportion of training data to set aside as + validation data for early stopping. 
If None, early stopping is done on + the training data. Only used if early stopping is performed. + n_iter_no_change : int, optional (default=10) + Used to determine when to "early stop". The fitting process is + stopped when none of the last ``n_iter_no_change`` scores are better + than the ``n_iter_no_change - 1`` -th-to-last one, up to some + tolerance. Only used if early stopping is performed. + validation_set_split_strategy : Union[str, _LGBM_ScikitCustomEvalSetSplitter] (default=None) + Strategy to use to split validation data for early stopping. + If 'random' a random set of train data is used for validation. + If 'stratify', the random set of data is taking using stratifed sampling. + If 'group', the split is done using a random sample of groups. Only used in the learning-to-rank task. + Default: 'random' for LGBMRegressor, 'stratify' for LGBMClassifier, 'group' for LGBMRanker. + Alternatively, a custom splitting function can be provided, for more details, see note below. **kwargs Other parameters for the model. Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters. @@ -500,6 +829,51 @@ def __init__( For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes], and grad and hess should be returned in the same format. + + A custom validation data splitting function can be provided for the ``validation_set_split_strategy`` parameter. + In this case, it should have the signature + ``func(X, y) -> X_train, X_val, y_train, y_val``, + ``func(X, y, weight) -> X_train, X_val, y_train, y_val, weight_train, weight_val`` or + ``func(X, y, weight, group) -> X_train, X_val, y_train, y_val, weight_train, weight_val, group_train, group_val`` + where: + + X : numpy 2-D array of shape = [n_samples, n_features] + The features matrix. + y : numpy 1-D array of shape = [n_samples] + The target values. + weight : numpy 1-D array of shape = [n_samples] + The weight of samples. Weights should be non-negative. 
+ group : numpy 1-D array + Group/query data. + Only used in the learning-to-rank task. + sum(group) = n_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + X_train: numpy 2-D array of shape = [n_train_samples, n_features] + The features matrix to be used for training. + X_val: numpy 2-D array of shape = [n_val_samples, n_features] + The features matrix to be used as a evaluation set for early stopping. + y_train: numpy 1-D array of shape = [n_train_samples] + The target values to be used for training. + y_val: numpy 1-D array of shape = [n_val_samples] + The target values to be used as a evaluation set for early stopping. + weight_train: numpy 1-D array of shape = [n_train_samples] + The weight of samples to be used for training. Weights should be non-negative. + weight_val: numpy 1-D array of shape = [n_val_samples] + The weight of samples to be used as a evaluation set for early stopping. Weights should be non-negative. + group_train: numpy 1-D array + Group/query data to be used for training. + Only used in the learning-to-rank task. + sum(group) = n_train_samples. + For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + group_val: numpy 1-D array + Group/query data to be used as a evaluation set for early stopping. + Only used in the learning-to-rank task. + sum(group) = n_val_samples. 
+ For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, + where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + """ if not SKLEARN_INSTALLED: raise LightGBMError('scikit-learn is required for lightgbm.sklearn. ' @@ -536,6 +910,10 @@ def __init__( self._n_features_in: int = -1 self._classes: Optional[np.ndarray] = None self._n_classes: int = -1 + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.n_iter_no_change = n_iter_no_change + self.validation_set_split_strategy = validation_set_split_strategy self.set_params(**kwargs) def _more_tags(self) -> Dict[str, Any]: @@ -638,6 +1016,10 @@ def _process_params(self, stage: str) -> Dict[str, Any]: params.pop('importance_type', None) params.pop('n_estimators', None) params.pop('class_weight', None) + params.pop("early_stopping", None) + params.pop("validation_fraction", None) + params.pop("n_iter_no_change", None) + params.pop("validation_set_split_strategy", None) if isinstance(params['random_state'], np.random.RandomState): params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max) @@ -697,17 +1079,17 @@ def _process_n_jobs(self, n_jobs: Optional[int]) -> int: def fit( self, - X, - y, - sample_weight=None, - init_score=None, - group=None, - eval_set=None, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + group: Optional[_LGBM_GroupType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, eval_names: Optional[List[str]] = None, - eval_sample_weight=None, - eval_class_weight=None, - eval_init_score=None, - eval_group=None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_class_weight: Optional[List[float]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = 
None, + eval_group: Optional[List[_LGBM_GroupType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, feature_name: _LGBM_FeatureNameConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', @@ -719,9 +1101,13 @@ def fit( # Do not modify original args in fit function # Refer to https://github.com/microsoft/LightGBM/pull/2619 - eval_metric_list = copy.deepcopy(eval_metric) - if not isinstance(eval_metric_list, list): - eval_metric_list = [eval_metric_list] + eval_metric_list: List[Union[str, _LGBM_ScikitCustomEvalFunction]] + if eval_metric is None: + eval_metric_list = [] + elif isinstance(eval_metric, list): + eval_metric_list = copy.deepcopy(eval_metric) + else: + eval_metric_list = [copy.deepcopy(eval_metric)] # Separate built-in from callable evaluation metrics eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)] @@ -732,10 +1118,120 @@ def fit( params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric'] params['metric'] = [metric for metric in params['metric'] if metric is not None] + if self.early_stopping and eval_set is None: + + if self.validation_set_split_strategy is None: + if isinstance(self, LGBMRegressor): + _validation_set_split_strategy = "random" + elif isinstance(self, LGBMClassifier): + _validation_set_split_strategy = "stratify" + elif isinstance(self, LGBMRanker): + _validation_set_split_strategy = "group" + else: + raise ValueError("Unknown LGBMModel type.") + else: + _validation_set_split_strategy = self.validation_set_split_strategy + + if callable(_validation_set_split_strategy): + ( + _X_train, _X_val, + _y_train, _y_val, + sample_weight_train, sample_weight_val, + group_train, group_val + ) = _train_test_split_custom_splitter( + custom_splitter=_validation_set_split_strategy, + X=X, + y=y, + weight=sample_weight, + group=group + ) + elif self.validation_fraction is None: + # If validation_fraction is None 
early stopping is done on the training data + _X_train = X + _X_val = copy.copy(X) + _y_train = y + _y_val = copy.copy(y) + sample_weight_train = sample_weight + sample_weight_val = copy.copy(sample_weight) + group_train = group + group_val = copy.copy(group) + elif ( + _validation_set_split_strategy != "group" + and isinstance(self, LGBMRanker) + ): + raise ValueError( + "Parameter group has been specified but the selected" + f"validation_set_split_strategy ({_validation_set_split_strategy})" + "does not support groups please set validation_set_split_strategy to \"group\" or " + "provide a callable with the signature func(X, y, weights, group) -> " + "(X_train, X_val, y_train, y_val, weights_train, weights_val, group_train, group_val)" + ) + elif _validation_set_split_strategy == "random": + ( + _X_train, _X_val, + _y_train, _y_val, + sample_weight_train, sample_weight_val, + ) = _train_test_split( + X, + y, + sample_weight, + test_size=self.validation_fraction, + random_state=self.random_state, + stratified=False, + ) + group_train, group_val = None, None + elif _validation_set_split_strategy == "stratify": + ( + _X_train, _X_val, + _y_train, _y_val, + sample_weight_train, sample_weight_val, + ) = _train_test_split( + X, + y, + sample_weight, + test_size=self.validation_fraction, + random_state=self.random_state, + stratified=True, + ) + group_train, group_val = None, None + elif _validation_set_split_strategy == "group": + n_splits = max(int(np.ceil(1 / self.validation_fraction)), 2) + ( + _X_train, _X_val, + _y_train, _y_val, + sample_weight_train, sample_weight_val, + group_train, group_val + ) = _train_test_group_split( + X, + y, + sample_weight, + group, + n_splits=n_splits + ) + else: + raise ValueError( + "validation_set_split_strategy must be a callable or one of the following" + "values { \"random\", \"stratify\", \"group\"}, got" + f"{_validation_set_split_strategy}") + + eval_set = [(_X_val, _y_val)] + if sample_weight_val is not None: + 
eval_sample_weight = [sample_weight_val] + if group_val is not None: + eval_group = [group_val] + + _X, _y = _X_train, _y_train + _sample_weight = sample_weight_train + _group = group_train + else: + _X, _y = X, y + _sample_weight = sample_weight + _group = group + if not isinstance(X, (pd_DataFrame, dt_DataTable)): - _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) - if sample_weight is not None: - sample_weight = _LGBMCheckSampleWeight(sample_weight, _X) + _X, _y = _LGBMCheckXY(_X, _y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2) + if _sample_weight is not None: + _sample_weight = _LGBMCheckSampleWeight(_sample_weight, _X) else: _X, _y = X, y @@ -743,16 +1239,16 @@ def fit( self._class_weight = self.class_weight if self._class_weight is not None: class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y) - if sample_weight is None or len(sample_weight) == 0: - sample_weight = class_sample_weight + if _sample_weight is None or len(_sample_weight) == 0: + _sample_weight = class_sample_weight else: - sample_weight = np.multiply(sample_weight, class_sample_weight) + _sample_weight = np.multiply(_sample_weight, class_sample_weight) self._n_features = _X.shape[1] # copy for consistency self._n_features_in = self._n_features - train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group, + train_set = Dataset(data=_X, label=_y, weight=_sample_weight, group=_group, init_score=init_score, categorical_feature=categorical_feature, params=params) @@ -805,13 +1301,16 @@ def _get_meta_data(collection, name, i): evals_result: _EvalResultDict = {} callbacks.append(record_evaluation(evals_result)) + if (self.early_stopping and all(type(callback) is not _EarlyStoppingCallback for callback in callbacks)): + callbacks.append(early_stopping(self.n_iter_no_change)) + self._Booster = train( params=params, train_set=train_set, num_boost_round=self.n_estimators, valid_sets=valid_sets, 
valid_names=eval_names, - feval=eval_metrics_callable, + feval=eval_metrics_callable, # type: ignore[arg-type] init_model=init_model, feature_name=feature_name, callbacks=callbacks @@ -829,19 +1328,19 @@ def _get_meta_data(collection, name, i): return self fit.__doc__ = _lgbmmodel_doc_fit.format( - X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", - y_shape="array-like of shape = [n_samples]", - sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)", - init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", - group_shape="array-like or None, optional (default=None)", - eval_sample_weight_shape="list of array, or None, optional (default=None)", - eval_init_score_shape="list of array, or None, optional (default=None)", - eval_group_shape="list of array, or None, optional (default=None)" + X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", + y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]", + sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)", + init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)", + group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)", + eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)", + eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)", + 
eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)" ) + "\n\n" + _lgbmmodel_doc_custom_eval_note def predict( self, - X, + X: _LGBM_ScikitMatrixLike, raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -889,7 +1388,7 @@ def predict( predict.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted value for each sample.", - X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", output_name="predicted_result", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", @@ -929,7 +1428,7 @@ def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]: """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model.""" if not self.__sklearn_is_fitted__(): raise LGBMNotFittedError('No objective found. 
Need to call fit beforehand.') - return self._objective + return self._objective # type: ignore[return-value] @property def n_estimators_(self) -> int: @@ -993,14 +1492,14 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel): def fit( # type: ignore[override] self, - X, - y, - sample_weight=None, - init_score=None, - eval_set=None, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, eval_names: Optional[List[str]] = None, - eval_sample_weight=None, - eval_init_score=None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, feature_name: _LGBM_FeatureNameConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', @@ -1039,15 +1538,15 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel): def fit( # type: ignore[override] self, - X, - y, - sample_weight=None, - init_score=None, - eval_set=None, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, eval_names: Optional[List[str]] = None, - eval_sample_weight=None, - eval_class_weight=None, - eval_init_score=None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_class_weight: Optional[List[float]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, feature_name: _LGBM_FeatureNameConfiguration = 'auto', categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto', @@ -1090,7 +1589,7 @@ def fit( # type: ignore[override] eval_metric = eval_metric_list # do not modify args, as it causes errors in model selection tools - valid_sets: 
Optional[List[Tuple]] = None + valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None if eval_set is not None: if isinstance(eval_set, tuple): eval_set = [eval_set] @@ -1127,7 +1626,7 @@ def fit( # type: ignore[override] def predict( self, - X, + X: _LGBM_ScikitMatrixLike, raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -1157,7 +1656,7 @@ def predict( def predict_proba( self, - X, + X: _LGBM_ScikitMatrixLike, raw_score: bool = False, start_iteration: int = 0, num_iteration: Optional[int] = None, @@ -1189,7 +1688,7 @@ def predict_proba( predict_proba.__doc__ = _lgbmmodel_doc_predict.format( description="Return the predicted probability for each class for each sample.", - X_shape="array-like or sparse matrix of shape = [n_samples, n_features]", + X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame , scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]", output_name="predicted_probability", predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]", X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]", @@ -1223,16 +1722,16 @@ class LGBMRanker(LGBMModel): def fit( # type: ignore[override] self, - X, - y, - sample_weight=None, - init_score=None, - group=None, - eval_set=None, + X: _LGBM_ScikitMatrixLike, + y: _LGBM_LabelType, + sample_weight: Optional[_LGBM_WeightType] = None, + init_score: Optional[_LGBM_InitScoreType] = None, + group: Optional[_LGBM_GroupType] = None, + eval_set: Optional[List[_LGBM_ScikitValidSet]] = None, eval_names: Optional[List[str]] = None, - eval_sample_weight=None, - eval_init_score=None, - eval_group=None, + eval_sample_weight: Optional[List[_LGBM_WeightType]] = None, + eval_init_score: Optional[List[_LGBM_InitScoreType]] = None, + eval_group: Optional[List[_LGBM_GroupType]] = None, eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None, eval_at: Union[List[int], Tuple[int, 
...]] = (1, 2, 3, 4, 5), feature_name: _LGBM_FeatureNameConfiguration = 'auto', diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml new file mode 100644 index 000000000000..c3d550047389 --- /dev/null +++ b/python-package/pyproject.toml @@ -0,0 +1,57 @@ +[tool.isort] +line_length = 120 +skip_glob = [ + "*/external_libs/*", + "*/lightgbm-python/*" +] + +[tool.mypy] +exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*' +ignore_missing_imports = true + +[tool.ruff] +exclude = [ + "build", + "compile", + "docs", + "external_libs", + "lightgbm-python", + "setup.py" +] +ignore = [ + # (pydocstyle) Missing docstring in magic method + "D105", + # (pycodestyle) Line too long + "E501" +] +select = [ + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # pydocstyle + "D", + # pycodestyle + "E", + # pyflakes + "F" +] + +# this should be set to the oldest version of python LightGBM supports +target-version = "py37" + +[tool.ruff.per-file-ignores] +"examples/*" = [ + # pydocstyle + "D" +] +"tests/*" = [ + # (flake8-bugbear) Found useless expression + "B018", + # pydocstyle + "D" +] + +[tool.ruff.pydocstyle] + +convention = "numpy" diff --git a/python-package/setup.cfg b/python-package/setup.cfg new file mode 100644 index 000000000000..0f2746df16c4 --- /dev/null +++ b/python-package/setup.cfg @@ -0,0 +1,12 @@ +[flake8] +ignore = + # line too long + E501, + # line break occurred before a binary operator + W503 +exclude = + ./.nuget, + ./external_libs, + ./lightgbm-python, + ./python-package/build, + ./python-package/compile diff --git a/python-package/setup.py b/python-package/setup.py index b1620929f816..565cddd75ee4 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -7,8 +7,8 @@ from os import chdir from pathlib import Path from platform import system -from shutil import copyfile, copytree, rmtree -from typing import List, Optional, Union +from shutil import rmtree +from typing import List, 
Optional from setuptools import find_packages, setup from setuptools.command.install import install @@ -46,41 +46,6 @@ def find_lib() -> List[str]: return LIB_PATH -def copy_files(integrated_opencl: bool = False, use_gpu: bool = False) -> None: - - def copy_files_helper(folder_name: Union[str, Path]) -> None: - src = CURRENT_DIR.parent / folder_name - if src.is_dir(): - dst = CURRENT_DIR / 'compile' / folder_name - if dst.is_dir(): - rmtree(dst) - copytree(src, dst) - else: - raise Exception(f'Cannot copy {src} folder') - - if not IS_SOURCE_FLAG_PATH.is_file(): - copy_files_helper('include') - copy_files_helper('src') - for submodule in (CURRENT_DIR.parent / 'external_libs').iterdir(): - submodule_stem = submodule.stem - if submodule_stem == 'compute' and not use_gpu: - continue - copy_files_helper(Path('external_libs') / submodule_stem) - (CURRENT_DIR / "compile" / "windows").mkdir(parents=True, exist_ok=True) - copyfile(CURRENT_DIR.parent / "windows" / "LightGBM.sln", - CURRENT_DIR / "compile" / "windows" / "LightGBM.sln") - copyfile(CURRENT_DIR.parent / "windows" / "LightGBM.vcxproj", - CURRENT_DIR / "compile" / "windows" / "LightGBM.vcxproj") - copyfile(CURRENT_DIR.parent / "LICENSE", - CURRENT_DIR / "LICENSE") - copyfile(CURRENT_DIR.parent / "CMakeLists.txt", - CURRENT_DIR / "compile" / "CMakeLists.txt") - if integrated_opencl: - (CURRENT_DIR / "compile" / "cmake").mkdir(parents=True, exist_ok=True) - copyfile(CURRENT_DIR.parent / "cmake" / "IntegratedOpenCL.cmake", - CURRENT_DIR / "compile" / "cmake" / "IntegratedOpenCL.cmake") - - def clear_path(path: Path) -> None: if path.is_dir(): for file_name in path.iterdir(): @@ -126,7 +91,7 @@ def compile_cpp( logger.info("Starting to compile the library.") - cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")] + cmake_cmd = ["cmake", str(CURRENT_DIR / "compile"), "-D__BUILD_FOR_PYTHON=ON"] if integrated_opencl: use_gpu = False cmake_cmd.append("-D__INTEGRATE_OPENCL=ON") @@ -160,7 +125,8 @@ def compile_cpp( if 
use_mpi: raise Exception('MPI version cannot be compiled by MinGW due to the miss of MPI library in it') logger.info("Starting to compile with CMake and MinGW.") - silent_call(cmake_cmd + ["-G", "MinGW Makefiles"], raise_error=True, + # ref: https://stackoverflow.com/a/45104058/3986677 + silent_call(cmake_cmd + ["-G", "MinGW Makefiles", "-DCMAKE_SH=CMAKE_SH-NOTFOUND"], raise_error=True, error_msg='Please install CMake and all required dependencies first') silent_call(["mingw32-make.exe", "_lightgbm", f"-I{build_dir}", "-j4"], raise_error=True, error_msg='Please install MinGW first') @@ -254,7 +220,6 @@ def run(self) -> None: "please use 64-bit Python instead.") LOG_PATH.touch() if not self.precompile: - copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu) compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, @@ -315,7 +280,6 @@ def finalize_options(self) -> None: class CustomSdist(sdist): def run(self) -> None: - copy_files(integrated_opencl=True, use_gpu=True) IS_SOURCE_FLAG_PATH.touch() rmtree(CURRENT_DIR / 'lightgbm' / 'Release', ignore_errors=True) rmtree(CURRENT_DIR / 'lightgbm' / 'windows' / 'x64', ignore_errors=True) @@ -332,11 +296,8 @@ def run(self) -> None: LOG_PATH = Path.home() / 'LightGBM_compilation.log' LOG_NOTICE = f"The full version of error log was saved into {LOG_PATH}" IS_SOURCE_FLAG_PATH = CURRENT_DIR / '_IS_SOURCE_PACKAGE.txt' - _version_src = CURRENT_DIR.parent / 'VERSION.txt' - _version_dst = CURRENT_DIR / 'lightgbm' / 'VERSION.txt' - if _version_src.is_file(): - copyfile(_version_src, _version_dst) - version = _version_dst.read_text(encoding='utf-8').strip() + _version_file = CURRENT_DIR / 'lightgbm' / 'VERSION.txt' + version = _version_file.read_text(encoding='utf-8').strip() readme = (CURRENT_DIR / 
'README.rst').read_text(encoding='utf-8') sys.path.insert(0, str(CURRENT_DIR)) diff --git a/src/io/bin.cpp b/src/io/bin.cpp index 30da15d81053..3d84599e6589 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -160,20 +160,6 @@ namespace LightGBM { const std::vector& forced_upper_bounds) { std::vector bin_upper_bound; - // get list of distinct values - int left_cnt_data = 0; - int cnt_zero = 0; - int right_cnt_data = 0; - for (int i = 0; i < num_distinct_values; ++i) { - if (distinct_values[i] <= -kZeroThreshold) { - left_cnt_data += counts[i]; - } else if (distinct_values[i] > kZeroThreshold) { - right_cnt_data += counts[i]; - } else { - cnt_zero += counts[i]; - } - } - // get number of positive and negative distinct values int left_cnt = -1; for (int i = 0; i < num_distinct_values; ++i) { diff --git a/src/io/config.cpp b/src/io/config.cpp index 86b64a52d105..e8578046960a 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -378,6 +378,10 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } + if (use_quantized_grad) { + Log::Warning("Quantized training is not supported by GPU tree learner. Switch to full precision training."); + use_quantized_grad = false; + } } else if (device_type == std::string("cuda")) { // force row-wise for cuda version force_col_wise = false; @@ -385,6 +389,10 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } + if (use_quantized_grad) { + Log::Warning("Quantized training is not supported by CUDA tree learner. 
Switch to full precision training."); + use_quantized_grad = false; + } } // linear tree learner must be serial type and run on CPU device if (linear_tree) { diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index b1dbcc378a27..0906ba4b6439 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -251,6 +251,10 @@ const std::unordered_set& Config::parameter_set() { "output_model", "saved_feature_importance_type", "snapshot_freq", + "use_quantized_grad", + "num_grad_quant_bins", + "quant_train_renew_leaf", + "stochastic_rounding", "linear_tree", "max_bin", "max_bin_by_feature", @@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map>& Config::paramet {"output_model", {"model_output", "model_out"}}, {"saved_feature_importance_type", {}}, {"snapshot_freq", {"save_period"}}, + {"use_quantized_grad", {}}, + {"num_grad_quant_bins", {}}, + {"quant_train_renew_leaf", {}}, + {"stochastic_rounding", {}}, {"linear_tree", {"linear_trees"}}, {"max_bin", {"max_bins"}}, {"max_bin_by_feature", {}}, @@ -966,6 +982,10 @@ const std::unordered_map& Config::ParameterTypes() { {"output_model", "string"}, {"saved_feature_importance_type", "int"}, {"snapshot_freq", "int"}, + {"use_quantized_grad", "bool"}, + {"num_grad_quant_bins", "int"}, + {"quant_train_renew_leaf", "bool"}, + {"stochastic_rounding", "bool"}, {"linear_tree", "bool"}, {"max_bin", "int"}, {"max_bin_by_feature", "vector"}, diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a8f449d3f55b..5b23f01ec3a0 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of return ret.release(); } +template TrainingShareStates* Dataset::GetShareStates( score_t* gradients, score_t* hessians, const std::vector& is_feature_used, bool is_constant_hessian, - bool force_col_wise, bool force_row_wise) const { + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const { 
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", global_timer); if (force_col_wise && force_row_wise) { @@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates( share_state->CalcBinOffsets( feature_groups_, &offsets, true); share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets), - num_data_, feature_groups_, false, true); + num_data_, feature_groups_, false, true, num_grad_quant_bins); share_state->is_col_wise = true; share_state->is_constant_hessian = is_constant_hessian; return share_state; @@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates( share_state->CalcBinOffsets( feature_groups_, &offsets, false); share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_, - feature_groups_, false, false); + feature_groups_, false, false, num_grad_quant_bins); share_state->is_col_wise = false; share_state->is_constant_hessian = is_constant_hessian; return share_state; @@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates( std::vector col_wise_offsets; col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true); col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_, - feature_groups_, false, true); + feature_groups_, false, true, num_grad_quant_bins); col_wise_init_time = std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); std::vector row_wise_offsets; row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false); row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_, - feature_groups_, false, false); + feature_groups_, false, false, num_grad_quant_bins); row_wise_init_time = std::chrono::steady_clock::now() - start_time; uint64_t max_total_bin = std::max(row_wise_state->num_hist_total_bin(), @@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates( InitTrain(is_feature_used, row_wise_state.get()); std::chrono::duration col_wise_time, 
row_wise_time; start_time = std::chrono::steady_clock::now(); - ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, col_wise_state.get(), hist_data.data()); col_wise_time = std::chrono::steady_clock::now() - start_time; start_time = std::chrono::steady_clock::now(); - ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, + ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, hessians, gradients, hessians, row_wise_state.get(), hist_data.data()); row_wise_time = std::chrono::steady_clock::now() - start_time; @@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates( } } +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + +template TrainingShareStates* Dataset::GetShareStates( + score_t* gradients, score_t* hessians, + const std::vector& is_feature_used, bool is_constant_hessian, + bool force_col_wise, bool force_row_wise, + const int num_grad_quant_bins) const; + void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { feature_groups_.clear(); num_features_ = dataset->num_features_; @@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector& is_feature_used, is_feature_used); } -template +template void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, @@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, hist_t* hist_data) const { Common::FunctionTimer 
fun_time("Dataset::ConstructHistogramsMultiVal", global_timer); - share_state->ConstructHistograms( + share_state->ConstructHistograms( data_indices, num_data, gradients, hessians, hist_data); } -template +template void Dataset::ConstructHistogramsInner( const std::vector& is_feature_used, const data_size_t* data_indices, data_size_t num_data, const score_t* gradients, const score_t* hessians, score_t* ordered_gradients, score_t* ordered_hessians, TrainingShareStates* share_state, hist_t* hist_data) const { if (!share_state->is_col_wise) { - return ConstructHistogramsMultiVal( + return ConstructHistogramsMultiVal( data_indices, num_data, gradients, hessians, share_state, hist_data); } std::vector used_dense_group; @@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner( for (int gi = 0; gi < num_used_dense_group; ++gi) { OMP_LOOP_EX_BEGIN(); int group = used_dense_group[gi]; - auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; const int num_bin = feature_groups_[group]->num_total_bin_; - std::memset(reinterpret_cast(data_ptr), 0, - num_bin * kHistEntrySize); - if (USE_HESSIAN) { - if (USE_INDICES) { - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, - data_ptr); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 16) { + auto data_ptr = reinterpret_cast(reinterpret_cast(hist_data) + group_bin_boundaries_[group]); + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kInt16HistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } + } else { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt16( + data_indices, 0, num_data, ptr_ordered_grad, + data_ptr); + } else { + 
feature_groups_[group]->bin_data_->ConstructHistogramInt16( + 0, num_data, ptr_ordered_grad, data_ptr); + } + } } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + auto data_ptr = hist_data + group_bin_boundaries_[group]; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kInt32HistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } + } else { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + data_indices, 0, num_data, ptr_ordered_grad, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogramInt32( + 0, num_data, ptr_ordered_grad, data_ptr); + } + } } } else { - if (USE_INDICES) { - feature_groups_[group]->bin_data_->ConstructHistogram( - data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + auto data_ptr = hist_data + group_bin_boundaries_[group] * 2; + std::memset(reinterpret_cast(data_ptr), 0, + num_bin * kHistEntrySize); + if (USE_HESSIAN) { + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogram( + data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, + data_ptr); + } else { + feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); + } } else { - feature_groups_[group]->bin_data_->ConstructHistogram( - 0, num_data, ptr_ordered_grad, data_ptr); - } - auto cnt_dst = reinterpret_cast(data_ptr + 1); - for (int i = 0; i < num_bin * 2; i += 2) { - data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0]; + if (USE_INDICES) { + feature_groups_[group]->bin_data_->ConstructHistogram( + data_indices, 0, num_data, ptr_ordered_grad, data_ptr); + } else { + 
feature_groups_[group]->bin_data_->ConstructHistogram( + 0, num_data, ptr_ordered_grad, data_ptr); + } + auto cnt_dst = reinterpret_cast(data_ptr + 1); + for (int i = 0; i < num_bin * 2; i += 2) { + data_ptr[i + 1] = static_cast(cnt_dst[i]) * hessians[0]; + } } } OMP_LOOP_EX_END(); @@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner( } global_timer.Stop("Dataset::dense_bin_histogram"); if (multi_val_groud_id >= 0) { - if (num_used_dense_group > 0) { - ConstructHistogramsMultiVal( - data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, - share_state, - hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 32) { + int32_t* hist_data_ptr = reinterpret_cast(hist_data); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } + } else if (HIST_BITS == 16) { + int16_t* hist_data_ptr = reinterpret_cast(hist_data); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + reinterpret_cast(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2)); + } + } } else { - ConstructHistogramsMultiVal( - data_indices, num_data, gradients, hessians, share_state, - hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + if (num_used_dense_group > 0) { + ConstructHistogramsMultiVal( + data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, + share_state, + hist_data + 
group_bin_boundaries_[multi_val_groud_id] * 2); + } else { + ConstructHistogramsMultiVal( + data_indices, num_data, gradients, hessians, share_state, + hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); + } } } } // explicitly initialize template methods, for cross module call -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +#define CONSTRUCT_HISTOGRAMS_INNER_PARMA \ + const std::vector& is_feature_used, const data_size_t* data_indices, \ + data_size_t num_data, const score_t* gradients, const score_t* hessians, \ + score_t* ordered_gradients, score_t* ordered_hessians, \ + TrainingShareStates* share_state, hist_t* hist_data -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +// explicitly initialize template methods, for cross module call +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; -template void Dataset::ConstructHistogramsInner( - const std::vector& is_feature_used, const data_size_t* data_indices, - data_size_t num_data, const score_t* gradients, const score_t* hessians, - score_t* 
ordered_gradients, score_t* ordered_hessians, - TrainingShareStates* share_state, hist_t* hist_data) const; +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; + +template void Dataset::ConstructHistogramsInner(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const; void Dataset::FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const { @@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, } } +template +void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const { + const int group = feature2group_[feature_idx]; + const int sub_feature = feature2subfeature_[feature_idx]; + const BinMapper* bin_mapper = + feature_groups_[group]->bin_mappers_[sub_feature].get(); + const int most_freq_bin = bin_mapper->GetMostFreqBin(); + PACKED_HIST_BIN_T* data_ptr = reinterpret_cast(data); + PACKED_HIST_ACC_T int_sum_gradient_and_hessian_local = HIST_BITS_ACC == 16 ? 
+ ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | + static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + int_sum_gradient_and_hessian; + if (most_freq_bin > 0) { + const int num_bin = bin_mapper->num_bin(); + if (HIST_BITS_BIN == HIST_BITS_ACC) { + for (int i = 0; i < num_bin; ++i) { + if (i != most_freq_bin) { + int_sum_gradient_and_hessian_local -= data_ptr[i]; + } + } + data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local; + } else { + CHECK_EQ(HIST_BITS_ACC, 32); + CHECK_EQ(HIST_BITS_BIN, 16); + for (int i = 0; i < num_bin; ++i) { + if (i != most_freq_bin) { + const PACKED_HIST_BIN_T packed_hist = data_ptr[i]; + const PACKED_HIST_ACC_T packed_hist_acc = (static_cast(static_cast(packed_hist >> 16)) << 32) | + static_cast(packed_hist & 0x0000ffff); + int_sum_gradient_and_hessian_local -= packed_hist_acc; + } + } + PACKED_HIST_BIN_T int_sum_gradient_and_hessian_local_bin = + (static_cast(int_sum_gradient_and_hessian_local >> 32) << 16) | static_cast(int_sum_gradient_and_hessian_local & 0x0000ffff); + data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local_bin; + } + } +} + +template void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const; + +template void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const; + template void PushVector(std::vector* dest, const std::vector& src) { dest->reserve(dest->size() + src.size()); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 3d0f8db8e549..e612052e47d2 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -171,6 +171,146 @@ class DenseBin : public Bin { } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_ptr = reinterpret_cast(ordered_gradients); + 
const VAL_T* data_ptr_base = data_.data(); + if (USE_PREFETCH) { + const data_size_t pf_offset = 64 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = + USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + if (IS_4BIT) { + PREFETCH_T0(data_ptr_base + (pf_idx >> 1)); + } else { + PREFETCH_T0(data_ptr_base + pf_idx); + } + const auto ti = static_cast(data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto ti = static_cast(data(idx)); + const int16_t gradient_16 = gradients_ptr[i]; + if (USE_HESSIAN) { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[ti] += gradient_packed; + } else { + const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? 
gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (1); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void 
ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, ordered_gradients, out); + } + template data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin, diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index b4fbfbe673aa..780272bdc4e1 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -124,6 +124,123 @@ class MultiValDenseBin : public MultiValBin { gradients, hessians, out); } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, data_size_t start, data_size_t end, + const score_t* gradients_and_hessians, hist_t* out) const { + data_size_t i = start; + const VAL_T* data_ptr_base = data_.data(); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(gradients_and_hessians); + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + + if 
(USE_PREFETCH) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset; + if (!ORDERED) { + PREFETCH_T0(gradients_and_hessians_ptr + pf_idx); + } + PREFETCH_T0(data_ptr_base + RowPtr(pf_idx)); + const auto j_start = RowPtr(idx); + const VAL_T* data_ptr = data_ptr_base + j_start; + const int16_t gradient_16 = gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(data_ptr[j]); + const auto ti = (bin + offsets_[j]); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto j_start = RowPtr(idx); + const VAL_T* data_ptr = data_ptr_base + j_start; + const int16_t gradient_16 = gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? 
gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (int j = 0; j < num_feature_; ++j) { + const uint32_t bin = static_cast(data_ptr[j]); + const auto ti = (bin + offsets_[j]); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* 
/*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double, const std::vector& offsets) const override { return new MultiValDenseBin(num_data, num_bin, num_feature, offsets); diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index eaa30ef0a0cc..32a5a51b4f89 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -180,6 +180,124 @@ class MultiValSparseBin : public MultiValBin { gradients, hessians, out); } + template + void ConstructHistogramIntInner(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients_and_hessians, hist_t* out) const { + data_size_t i = start; + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(gradients_and_hessians); + const VAL_T* data_ptr = data_.data(); + const INDEX_T* row_ptr_base = row_ptr_.data(); + if (USE_PREFETCH) { + const data_size_t pf_offset = 32 / sizeof(VAL_T); + const data_size_t pf_end = end - pf_offset; + + for (; i < pf_end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto pf_idx = + USE_INDICES ? 
data_indices[i + pf_offset] : i + pf_offset; + if (!ORDERED) { + PREFETCH_T0(gradients_and_hessians_ptr + pf_idx); + } + PREFETCH_T0(row_ptr_base + pf_idx); + PREFETCH_T0(data_ptr + row_ptr_[pf_idx]); + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (auto j = j_start; j < j_end; ++j) { + const auto ti = static_cast(data_ptr[j]); + out_ptr[ti] += gradient_packed; + } + } + } + for (; i < end; ++i) { + const auto idx = USE_INDICES ? data_indices[i] : i; + const auto j_start = RowPtr(idx); + const auto j_end = RowPtr(idx + 1); + const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? 
gradient_16 : + ((static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | + static_cast(gradient_16 & 0xff)); + for (auto j = j_start; j < j_end; ++j) { + const auto ti = static_cast(data_ptr[j]); + out_ptr[ti] += gradient_packed; + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt32(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt16(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* gradients, + const score_t* /*hessians*/, hist_t* out) const override { + 
ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* gradients, const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner( + nullptr, start, end, gradients, out); + } + + void ConstructHistogramOrderedInt8(const data_size_t* data_indices, + data_size_t start, data_size_t end, + const score_t* gradients, + const score_t* /*hessians*/, + hist_t* out) const override { + ConstructHistogramIntInner(data_indices, start, end, + gradients, out); + } + MultiValBin* CreateLike(data_size_t num_data, int num_bin, int, double estimate_element_per_row, const std::vector& /*offsets*/) const override { diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp index e01c0afcf5bc..f7137d29ffd9 100644 --- a/src/io/sparse_bin.hpp +++ b/src/io/sparse_bin.hpp @@ -203,6 +203,184 @@ class SparseBin : public Bin { } #undef ACC_GH + template + void ConstructIntHistogramInner(data_size_t start, data_size_t end, + const score_t* ordered_gradients_and_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(start, &i_delta, &cur_pos); + if (USE_HESSIAN) { + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + while (cur_pos < start && i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const VAL_T bin = vals_[i_delta]; + const int16_t gradient_16 = gradients_and_hessians_ptr[cur_pos]; + const PACKED_HIST_T gradient_64 = (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[bin] += gradient_64; + cur_pos += deltas_[++i_delta]; + } + } else { + GRAD_HIST_T* grad = reinterpret_cast(out); + HESS_HIST_T* cnt = reinterpret_cast(out) + 1; + const int8_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + while (cur_pos < start 
&& i_delta < num_vals_) { + cur_pos += deltas_[++i_delta]; + } + while (cur_pos < end && i_delta < num_vals_) { + const uint32_t ti = static_cast(vals_[i_delta]) << 1; + grad[ti] += gradients_and_hessians_ptr[cur_pos]; + ++cnt[ti]; + cur_pos += deltas_[++i_delta]; + } + } + } + + template + void ConstructIntHistogramInner(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients_and_hessians, + hist_t* out) const { + data_size_t i_delta, cur_pos; + InitIndex(data_indices[start], &i_delta, &cur_pos); + data_size_t i = start; + if (USE_HESSIAN) { + PACKED_HIST_T* out_ptr = reinterpret_cast(out); + const int16_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { + break; + } + } else { + const VAL_T bin = vals_[i_delta]; + const int16_t gradient_16 = gradients_and_hessians_ptr[i]; + const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? 
gradient_16 : + (static_cast(static_cast(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff); + out_ptr[bin] += gradient_packed; + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } else { + GRAD_HIST_T* grad = reinterpret_cast(out); + HESS_HIST_T* cnt = reinterpret_cast(out) + 1; + const int8_t* gradients_and_hessians_ptr = reinterpret_cast(ordered_gradients_and_hessians); + for (;;) { + if (cur_pos < data_indices[i]) { + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } else if (cur_pos > data_indices[i]) { + if (++i >= end) { + break; + } + } else { + const uint32_t ti = static_cast(vals_[i_delta]) << 1; + grad[ti] += gradients_and_hessians_ptr[i << 1]; + ++cnt[ti]; + if (++i >= end) { + break; + } + cur_pos += deltas_[++i_delta]; + if (i_delta >= num_vals_) { + break; + } + } + } + } + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt32(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + 
data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt16(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + const score_t* /*ordered_hessians*/, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, + data_size_t end, const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(data_indices, start, end, ordered_gradients, out); + } + + void ConstructHistogramInt8(data_size_t start, data_size_t end, + const score_t* ordered_gradients, + hist_t* out) const override { + ConstructIntHistogramInner(start, end, ordered_gradients, out); + } + inline void NextNonzeroFast(data_size_t* i_delta, data_size_t* cur_pos) const { *cur_pos 
+= deltas_[++(*i_delta)]; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index f6462697a93d..71b2e097ef1b 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -9,7 +9,7 @@ namespace LightGBM { MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, - const std::vector& feature_groups_contained): + const std::vector& feature_groups_contained, const int num_grad_quant_bins): feature_groups_contained_(feature_groups_contained) { num_threads_ = OMP_NUM_THREADS(); num_data_ = num_data; @@ -19,6 +19,7 @@ MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, } num_bin_ = bin->num_bin(); num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize; + num_grad_quant_bins_ = num_grad_quant_bins; } void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, @@ -45,43 +46,161 @@ void MultiValBinWrapper::InitTrain(const std::vector& group_feature_start, } } +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf) { - if (!is_use_subcol_) { + if (!is_use_subcol_ && INNER_HIST_BITS != 8) { return; } - const hist_t* src = hist_buf.data() + hist_buf.size() - - 2 * static_cast(num_bin_aligned_); - #pragma omp parallel for schedule(static) - for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { - std::copy_n(src + hist_move_src_[i], hist_move_size_[i], - origin_hist_data_ + hist_move_dest_[i]); + if (USE_QUANT_GRAD) { + if (HIST_BITS == 32) { + const int64_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + static_cast(num_bin_aligned_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, + reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); + } + } else if (HIST_BITS == 16) { + const int32_t* src = reinterpret_cast(hist_buf.data()) + hist_buf.size() / 2 - + 
static_cast(num_bin_aligned_); + if (is_use_subcol_) { + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, + reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); + } + } else { + int32_t* orig_ptr = reinterpret_cast(origin_hist_data_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < num_bin_; ++i) { + orig_ptr[i] = src[i]; + } + } + } + } else { + const hist_t* src = hist_buf.data() + hist_buf.size() - + 2 * static_cast(num_bin_aligned_); + #pragma omp parallel for schedule(static) + for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { + std::copy_n(src + hist_move_src_[i], hist_move_size_[i], + origin_hist_data_ + hist_move_dest_[i]); + } } } +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMove(const std::vector>& hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf) { int n_bin_block = 1; int bin_block_size = num_bin_; Threading::BlockInfo(num_threads_, num_bin_, 512, &n_bin_block, &bin_block_size); - hist_t* dst = origin_hist_data_; - if (is_use_subcol_) { - dst = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); - } - #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) - for (int t = 0; t < n_bin_block; ++t) { - const int start = t * bin_block_size; - const int end = std::min(start + bin_block_size, num_bin_); - for (int tid = 1; tid < n_data_block_; ++tid) { - auto src_ptr = hist_buf->data() + static_cast(num_bin_aligned_) * 2 * (tid - 1); - for (int i = start * 
2; i < end * 2; ++i) { - dst[i] += src_ptr[i]; + if (USE_QUANT_GRAD) { + if (HIST_BITS == 32) { + int64_t* dst = reinterpret_cast(origin_hist_data_); + if (is_use_subcol_) { + dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * (tid - 1); + for (int i = start; i < end; ++i) { + dst[i] += src_ptr[i]; + } + } + } + } else if (HIST_BITS == 16 && INNER_HIST_BITS == 16) { + int32_t* dst = reinterpret_cast(origin_hist_data_); + if (is_use_subcol_) { + dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * (tid - 1); + for (int i = start; i < end; ++i) { + dst[i] += src_ptr[i]; + } + } + } + } else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) { + int32_t* dst = reinterpret_cast(hist_buf->data()) + hist_buf->size() / 2 - static_cast(num_bin_aligned_); + std::memset(reinterpret_cast(dst), 0, num_bin_ * kInt16HistBufferEntrySize); + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 0; tid < n_data_block_; ++tid) { + auto src_ptr = reinterpret_cast(hist_buf->data()) + static_cast(num_bin_aligned_) * tid; + for 
(int i = start; i < end; ++i) { + const int16_t packed_hist = src_ptr[i]; + const int32_t packed_hist_int32 = (static_cast(static_cast(packed_hist >> 8)) << 16) | static_cast(packed_hist & 0x00ff); + dst[i] += packed_hist_int32; + } + } + } + } + } else { + hist_t* dst = origin_hist_data_; + if (is_use_subcol_) { + dst = hist_buf->data() + hist_buf->size() - 2 * static_cast(num_bin_aligned_); + } + #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) + for (int t = 0; t < n_bin_block; ++t) { + const int start = t * bin_block_size; + const int end = std::min(start + bin_block_size, num_bin_); + for (int tid = 1; tid < n_data_block_; ++tid) { + auto src_ptr = hist_buf->data() + static_cast(num_bin_aligned_) * 2 * (tid - 1); + for (int i = start * 2; i < end * 2; ++i) { + dst[i] += src_ptr[i]; + } } } } } +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + +template void MultiValBinWrapper::HistMerge(std::vector>* hist_buf); + void MultiValBinWrapper::ResizeHistBuf(std::vector>* hist_buf, MultiValBin* sub_multi_val_bin, @@ -389,7 +508,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector>& feature_groups, - bool dense_only, bool sparse_only) { + bool dense_only, bool sparse_only, const int num_grad_quant_bins) { num_threads = OMP_NUM_THREADS(); if (bin == nullptr) { return; @@ -408,7 +527,7 @@ void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, num_total_bin_ += bin->num_bin(); num_elements_per_row_ += bin->num_element_per_row(); multi_val_bin_wrapper_.reset(new MultiValBinWrapper( - bin, num_data, feature_groups_contained)); + bin, num_data, feature_groups_contained, num_grad_quant_bins)); } } // namespace 
LightGBM diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 677b7dc6eb82..2509db5e722a 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -30,7 +30,9 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo auto max_cat_threshold = this->config_->max_cat_threshold; // need to be able to hold smaller and larger best splits in SyncUpGlobalBestSplit size_t split_info_size = static_cast(SplitInfo::Size(max_cat_threshold) * 2); - size_t histogram_size = static_cast(this->share_state_->num_hist_total_bin() * kHistEntrySize); + size_t histogram_size = this->config_->use_quantized_grad ? + static_cast(this->share_state_->num_hist_total_bin() * kInt32HistEntrySize) : + static_cast(this->share_state_->num_hist_total_bin() * kHistEntrySize); // allocate buffer for communication size_t buffer_size = std::max(histogram_size, split_info_size); @@ -43,8 +45,19 @@ void DataParallelTreeLearner::Init(const Dataset* train_data, boo block_start_.resize(num_machines_); block_len_.resize(num_machines_); + if (this->config_->use_quantized_grad) { + block_start_int16_.resize(num_machines_); + block_len_int16_.resize(num_machines_); + } + buffer_write_start_pos_.resize(this->num_features_); buffer_read_start_pos_.resize(this->num_features_); + + if (this->config_->use_quantized_grad) { + buffer_write_start_pos_int16_.resize(this->num_features_); + buffer_read_start_pos_int16_.resize(this->num_features_); + } + global_data_count_in_leaf_.resize(this->config_->num_leaves); } @@ -55,100 +68,155 @@ void DataParallelTreeLearner::ResetConfig(const Config* config) { } template -void DataParallelTreeLearner::BeforeTrain() { - TREELEARNER_T::BeforeTrain(); - // generate feature partition for current tree - std::vector> feature_distribution(num_machines_, std::vector()); - std::vector num_bins_distributed(num_machines_, 0); - for (int i = 0; i < 
this->train_data_->num_total_features(); ++i) { - int inner_feature_index = this->train_data_->InnerFeatureIndex(i); - if (inner_feature_index == -1) { continue; } - if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) { - int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); - feature_distribution[cur_min_machine].push_back(inner_feature_index); - auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index); - if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) { - num_bin -= 1; - } - num_bins_distributed[cur_min_machine] += num_bin; - } - is_feature_aggregated_[inner_feature_index] = false; - } - // get local used feature - for (auto fid : feature_distribution[rank_]) { - is_feature_aggregated_[fid] = true; - } - +void DataParallelTreeLearner::PrepareBufferPos( + const std::vector>& feature_distribution, + std::vector* block_start, + std::vector* block_len, + std::vector* buffer_write_start_pos, + std::vector* buffer_read_start_pos, + comm_size_t* reduce_scatter_size, + size_t hist_entry_size) { // get block start and block len for reduce scatter - reduce_scatter_size_ = 0; + *reduce_scatter_size = 0; for (int i = 0; i < num_machines_; ++i) { - block_len_[i] = 0; + (*block_len)[i] = 0; for (auto fid : feature_distribution[i]) { auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - block_len_[i] += num_bin * kHistEntrySize; + (*block_len)[i] += num_bin * hist_entry_size; } - reduce_scatter_size_ += block_len_[i]; + *reduce_scatter_size += (*block_len)[i]; } - block_start_[0] = 0; + (*block_start)[0] = 0; for (int i = 1; i < num_machines_; ++i) { - block_start_[i] = block_start_[i - 1] + block_len_[i - 1]; + (*block_start)[i] = (*block_start)[i - 1] + (*block_len)[i - 1]; } - // get buffer_write_start_pos_ + // get buffer_write_start_pos int bin_size = 0; for (int i = 0; i < num_machines_; ++i) { 
for (auto fid : feature_distribution[i]) { - buffer_write_start_pos_[fid] = bin_size; + (*buffer_write_start_pos)[fid] = bin_size; auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * kHistEntrySize; + bin_size += num_bin * hist_entry_size; } } - // get buffer_read_start_pos_ + // get buffer_read_start_pos bin_size = 0; for (auto fid : feature_distribution[rank_]) { - buffer_read_start_pos_[fid] = bin_size; + (*buffer_read_start_pos)[fid] = bin_size; auto num_bin = this->train_data_->FeatureNumBin(fid); if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) { num_bin -= 1; } - bin_size += num_bin * kHistEntrySize; + bin_size += num_bin * hist_entry_size; } +} - // sync global data sumup info - std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), - this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); - int size = sizeof(data); - std::memcpy(input_buffer_.data(), &data, size); - // global sumup reduce - Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { - comm_size_t used_size = 0; - const std::tuple *p1; - std::tuple *p2; - while (used_size < len) { - p1 = reinterpret_cast *>(src); - p2 = reinterpret_cast *>(dst); - std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); - std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); - std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); - src += type_size; - dst += type_size; - used_size += type_size; +template +void DataParallelTreeLearner::BeforeTrain() { + TREELEARNER_T::BeforeTrain(); + // generate feature partition for current tree + std::vector> feature_distribution(num_machines_, std::vector()); + std::vector num_bins_distributed(num_machines_, 0); + for (int i = 0; i < this->train_data_->num_total_features(); ++i) { + int 
inner_feature_index = this->train_data_->InnerFeatureIndex(i); + if (inner_feature_index == -1) { continue; } + if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) { + int cur_min_machine = static_cast(ArrayArgs::ArgMin(num_bins_distributed)); + feature_distribution[cur_min_machine].push_back(inner_feature_index); + auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index); + if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) { + num_bin -= 1; + } + num_bins_distributed[cur_min_machine] += num_bin; } - }); - // copy back - std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); - // set global sumup info - this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); - // init global data count in leaf - global_data_count_in_leaf_[0] = std::get<0>(data); + is_feature_aggregated_[inner_feature_index] = false; + } + // get local used feature + for (auto fid : feature_distribution[rank_]) { + is_feature_aggregated_[fid] = true; + } + + // get block start and block len for reduce scatter + if (this->config_->use_quantized_grad) { + PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_, + &buffer_read_start_pos_, &reduce_scatter_size_, kInt32HistEntrySize); + PrepareBufferPos(feature_distribution, &block_start_int16_, &block_len_int16_, &buffer_write_start_pos_int16_, + &buffer_read_start_pos_int16_, &reduce_scatter_size_int16_, kInt16HistEntrySize); + } else { + PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_, + &buffer_read_start_pos_, &reduce_scatter_size_, kHistEntrySize); + } + + if (this->config_->use_quantized_grad) { + // sync global data sumup info + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_splits_->int_sum_gradients_and_hessians()); + int size = sizeof(data); + 
std::memcpy(input_buffer_.data(), &data, size); + // global sumup reduce + Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { + comm_size_t used_size = 0; + const std::tuple *p1; + std::tuple *p2; + while (used_size < len) { + p1 = reinterpret_cast *>(src); + p2 = reinterpret_cast *>(dst); + std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); + std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); + std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); + std::get<3>(*p2) = std::get<3>(*p2) + std::get<3>(*p1); + src += type_size; + dst += type_size; + used_size += type_size; + } + }); + // copy back + std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); + // set global sumup info + this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data), std::get<3>(data)); + // init global data count in leaf + global_data_count_in_leaf_[0] = std::get<0>(data); + // reset hist num bits according to global num data + this->gradient_discretizer_->template SetNumBitsInHistogramBin(0, -1, GetGlobalDataCountInLeaf(0), 0); + } else { + // sync global data sumup info + std::tuple data(this->smaller_leaf_splits_->num_data_in_leaf(), + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians()); + int size = sizeof(data); + std::memcpy(input_buffer_.data(), &data, size); + // global sumup reduce + Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) { + comm_size_t used_size = 0; + const std::tuple *p1; + std::tuple *p2; + while (used_size < len) { + p1 = reinterpret_cast *>(src); + p2 = reinterpret_cast *>(dst); + std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1); + std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1); + std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1); + src += type_size; + dst += type_size; + 
used_size += type_size; + } + }); + // copy back + std::memcpy(reinterpret_cast(&data), output_buffer_.data(), size); + // set global sumup info + this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data)); + // init global data count in leaf + global_data_count_in_leaf_[0] = std::get<0>(data); + } } template @@ -167,23 +235,66 @@ void DataParallelTreeLearner::FindBestSplits(const Tree* tree) { const BinMapper* feature_bin_mapper = this->train_data_->FeatureBinMapper(feature_index); const int offset = static_cast(feature_bin_mapper->GetMostFreqBin() == 0); const int num_bin = feature_bin_mapper->num_bin(); - hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData(); - std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kHistEntrySize); + if (this->config_->use_quantized_grad) { + int32_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawDataInt32(); + std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kInt32HistEntrySize); + int16_t* hist_ptr_int16 = this->smaller_leaf_histogram_array_[feature_index].RawDataInt16(); + std::memset(reinterpret_cast(hist_ptr_int16), 0, (num_bin - offset) * kInt16HistEntrySize); + } else { + hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData(); + std::memset(reinterpret_cast(hist_ptr), 0, (num_bin - offset) * kHistEntrySize); + } } } // construct local histograms + global_timer.Start("DataParallelTreeLearner::ReduceHistogram"); + global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy"); #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) continue; // copy to buffer - std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], + if (this->config_->use_quantized_grad) { + const uint8_t local_smaller_leaf_num_bits = this->gradient_discretizer_->template 
GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_int16_[feature_index], + this->smaller_leaf_histogram_array_[feature_index].RawDataInt16(), + this->smaller_leaf_histogram_array_[feature_index].SizeOfInt16Histgram()); + } else { + if (local_smaller_leaf_num_bits == 32) { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], + this->smaller_leaf_histogram_array_[feature_index].RawDataInt32(), + this->smaller_leaf_histogram_array_[feature_index].SizeOfInt32Histgram()); + } else { + this->smaller_leaf_histogram_array_[feature_index].CopyFromInt16ToInt32( + input_buffer_.data() + buffer_write_start_pos_[feature_index]); + } + } + } else { + std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index], this->smaller_leaf_histogram_array_[feature_index].RawData(), this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram()); + } } + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::Copy"); // Reduce scatter for histogram - Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), - block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); + global_timer.Start("DataParallelTreeLearner::ReduceHistogram::ReduceScatter"); + if (!this->config_->use_quantized_grad) { + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &HistogramSumReducer); + } else { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + 
Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_int16_, sizeof(int16_t), block_start_int16_.data(), + block_len_int16_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &Int16HistogramSumReducer); + } else { + Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(int_hist_t), block_start_.data(), + block_len_.data(), output_buffer_.data(), static_cast(output_buffer_.size()), &Int32HistogramSumReducer); + } + } + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::ReduceScatter"); + global_timer.Stop("DataParallelTreeLearner::ReduceHistogram"); this->FindBestSplitsFromHistograms( this->col_sampler_.is_feature_used_bytree(), true, tree); } @@ -198,6 +309,26 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const this->col_sampler_.GetByNode(tree, this->larger_leaf_splits_->leaf_index()); double smaller_leaf_parent_output = this->GetParentOutput(tree, this->smaller_leaf_splits_.get()); double larger_leaf_parent_output = this->GetParentOutput(tree, this->larger_leaf_splits_.get()); + + if (this->config_->use_quantized_grad && this->larger_leaf_splits_ != nullptr && this->larger_leaf_splits_->leaf_index() >= 0) { + const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index()); + const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode(parent_index); + const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->larger_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if 
(!is_feature_aggregated_[feature_index]) continue; + this->larger_leaf_histogram_array_[feature_index].CopyToBuffer(this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } + } + OMP_INIT_EX(); #pragma omp parallel for schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { @@ -206,12 +337,39 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const const int tid = omp_get_thread_num(); const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index); // restore global histograms from buffer - this->smaller_leaf_histogram_array_[feature_index].FromMemory( - output_buffer_.data() + buffer_read_start_pos_[feature_index]); + if (this->config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (smaller_leaf_num_bits <= 16) { + this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt16( + output_buffer_.data() + buffer_read_start_pos_int16_[feature_index]); + } else { + this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt32( + output_buffer_.data() + buffer_read_start_pos_[feature_index]); + } + } else { + this->smaller_leaf_histogram_array_[feature_index].FromMemory( + output_buffer_.data() + buffer_read_start_pos_[feature_index]); + } - this->train_data_->FixHistogram(feature_index, - this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), - this->smaller_leaf_histogram_array_[feature_index].RawData()); + if (this->config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + const int64_t int_sum_gradient_and_hessian = this->smaller_leaf_splits_->int_sum_gradients_and_hessians(); + if (smaller_leaf_num_bits <= 16) { + this->train_data_->template FixHistogramInt( + 
feature_index, + int_sum_gradient_and_hessian, + reinterpret_cast(this->smaller_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + this->train_data_->template FixHistogramInt( + feature_index, + int_sum_gradient_and_hessian, + reinterpret_cast(this->smaller_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + this->train_data_->FixHistogram(feature_index, + this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(), + this->smaller_leaf_histogram_array_[feature_index].RawData()); + } this->ComputeBestSplitForFeature( this->smaller_leaf_histogram_array_, feature_index, real_feature_index, @@ -225,8 +383,31 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->leaf_index() < 0) continue; // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms - this->larger_leaf_histogram_array_[feature_index].Subtract( - this->smaller_leaf_histogram_array_[feature_index]); + if (this->config_->use_quantized_grad) { + const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index()); + const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode(parent_index); + const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->larger_leaf_splits_->leaf_index()); + const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf(this->smaller_leaf_splits_->leaf_index()); + if (parent_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + CHECK_LE(larger_leaf_num_bits, 16); + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } else if (larger_leaf_num_bits <= 16) { + CHECK_LE(smaller_leaf_num_bits, 16); + 
this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index], this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + } else if (smaller_leaf_num_bits <= 16) { + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } else { + this->larger_leaf_histogram_array_[feature_index].template Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } + } else { + this->larger_leaf_histogram_array_[feature_index].Subtract( + this->smaller_leaf_histogram_array_[feature_index]); + } this->ComputeBestSplitForFeature( this->larger_leaf_histogram_array_, feature_index, real_feature_index, @@ -273,6 +454,10 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in // need update global number of data in leaf global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count; global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count; + // reset hist num bits according to global num data + if (this->config_->use_quantized_grad) { + this->gradient_discretizer_->template SetNumBitsInHistogramBin(*left_leaf, *right_leaf, GetGlobalDataCountInLeaf(*left_leaf), GetGlobalDataCountInLeaf(*right_leaf)); + } } // instantiate template classes, otherwise linker cannot find the code diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index 7804292d15d0..d917ed7917ec 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -51,6 +51,18 @@ class FeatureHistogram { /*! \brief Disable copy */ FeatureHistogram(const FeatureHistogram&) = delete; + /*! 
+ * \brief Init the feature histogram + * \param feature the feature data for this histogram + * \param min_num_data_one_leaf minimal number of data in one leaf + */ + void Init(hist_t* data, int16_t* data_int16, const FeatureMetainfo* meta) { + meta_ = meta; + data_ = data; + data_int16_ = data_int16; + ResetFunc(); + } + /*! * \brief Init the feature histogram * \param feature the feature data for this histogram @@ -59,6 +71,7 @@ class FeatureHistogram { void Init(hist_t* data, const FeatureMetainfo* meta) { meta_ = meta; data_ = data; + data_int16_ = nullptr; ResetFunc(); } @@ -72,13 +85,80 @@ class FeatureHistogram { hist_t* RawData() { return data_; } + int32_t* RawDataInt32() { return reinterpret_cast(data_); } + + int16_t* RawDataInt16() { return data_int16_; } + /*! * \brief Subtract current histograms with other * \param other The histogram that want to subtract */ - void Subtract(const FeatureHistogram& other) { - for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { - data_[i] -= other.data_[i]; + template + void Subtract(const FeatureHistogram& other, const int32_t* buffer = nullptr) { + if (USE_DIST_GRAD) { + const THIS_HIST_T* this_int_data = THIS_HIST_BITS == 16 ? + reinterpret_cast(data_int16_) : + (RESULT_HIST_BITS == 16 ? + reinterpret_cast(buffer) : + reinterpret_cast(data_)); + const OTHER_HIST_T* other_int_data = OTHER_HIST_BITS == 16 ? + reinterpret_cast(other.data_int16_) : + reinterpret_cast(other.data_); + RESULT_HIST_T* result_int_data = RESULT_HIST_BITS == 16 ? 
+ reinterpret_cast(data_int16_) : + reinterpret_cast(data_); + if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 32) { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + const int32_t other_grad_hess = static_cast(other_int_data[i]); + const int64_t this_grad_hess = this_int_data[i]; + const int64_t other_grad_hess_int64 = + (static_cast(static_cast(other_grad_hess >> 16)) << 32) | + (static_cast(other_grad_hess & 0x0000ffff)); + const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; + result_int_data[i] = result_grad_hess; + } + } else if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 16) { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + const int32_t other_grad_hess = static_cast(other_int_data[i]); + const int64_t this_grad_hess = this_int_data[i]; + const int64_t other_grad_hess_int64 = + (static_cast(static_cast(other_grad_hess >> 16)) << 32) | + (static_cast(other_grad_hess & 0x0000ffff)); + const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64; + const int32_t result_grad_hess_int32 = + (static_cast(result_grad_hess >> 32) << 16) | + static_cast(result_grad_hess & 0x00000000ffffffff); + result_int_data[i] = result_grad_hess_int32; + } + } else { + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + result_int_data[i] = this_int_data[i] - other_int_data[i]; + } + } + } else { + for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) { + data_[i] -= other.data_[i]; + } + } + } + + void CopyToBuffer(int32_t* buffer) { + const int64_t* data_ptr = reinterpret_cast(data_); + int64_t* buffer_ptr = reinterpret_cast(buffer); + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + buffer_ptr[i] = data_ptr[i]; + } + } + + void CopyFromInt16ToInt32(char* buffer) { + const int32_t* int16_data = reinterpret_cast(RawDataInt16()); + int64_t* int32_data = reinterpret_cast(buffer); + for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) { + 
const int32_t int16_val = int16_data[i]; + int32_data[i] = (static_cast(static_cast(int16_val >> 16)) << 32) | + static_cast(int16_val & 0x0000ffff); } } @@ -94,8 +174,23 @@ class FeatureHistogram { output->gain *= meta_->penalty; } + void FindBestThresholdInt(int64_t sum_gradient_and_hessian, + double grad_scale, double hess_scale, + const uint8_t num_bits_bin, + const uint8_t num_bits_acc, + data_size_t num_data, + const FeatureConstraint* constraints, + double parent_output, + SplitInfo* output) { + output->default_left = true; + output->gain = kMinScore; + int_find_best_threshold_fun_(sum_gradient_and_hessian, grad_scale, hess_scale, num_bits_bin, num_bits_acc, num_data, + constraints, parent_output, output); + output->gain *= meta_->penalty; + } + template - double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, + double BeforeNumerical(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data, SplitInfo* output, int* rand_threshold) { is_splittable_ = false; output->monotone_type = meta_->monotone_type; @@ -112,6 +207,27 @@ class FeatureHistogram { return gain_shift + meta_->config->min_gain_to_split; } + template + double BeforeNumericalInt(int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, double parent_output, data_size_t num_data, + SplitInfo* output, int* rand_threshold) { + is_splittable_ = false; + output->monotone_type = meta_->monotone_type; + const int32_t int_sum_gradient = static_cast(sum_gradient_and_hessian >> 32); + const uint32_t int_sum_hessian = static_cast(sum_gradient_and_hessian & 0x00000000ffffffff); + const double sum_gradient = static_cast(int_sum_gradient) * grad_scale; + const double sum_hessian = static_cast(int_sum_hessian) * hess_scale; + double gain_shift = GetLeafGain( + sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, meta_->config->path_smooth, num_data, 
parent_output); + *rand_threshold = 0; + if (USE_RAND) { + if (meta_->num_bin - 2 > 0) { + *rand_threshold = meta_->rand.NextInt(0, meta_->num_bin - 2); + } + } + return gain_shift + meta_->config->min_gain_to_split; + } + void FuncForNumrical() { if (meta_->config->extra_trees) { if (meta_->config->monotone_constraints.empty()) { @@ -155,6 +271,119 @@ class FeatureHistogram { template void FuncForNumricalL3() { + if (meta_->config->use_quantized_grad) { +#define TEMPLATE_PREFIX_INT USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING +#define LAMBDA_ARGUMENTS_INT \ + int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, const uint8_t hist_bits_bin, const uint8_t hist_bits_acc, data_size_t num_data, \ + const FeatureConstraint* constraints, double parent_output, SplitInfo *output +#define BEFORE_ARGUMENTS_INT sum_gradient_and_hessian, grad_scale, hess_scale, parent_output, num_data, output, &rand_threshold +#define FUNC_ARGUMENTS_INT \ + sum_gradient_and_hessian, grad_scale, hess_scale, num_data, constraints, min_gain_shift, \ + output, rand_threshold, parent_output + + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { + if (meta_->missing_type == MissingType::Zero) { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + 
BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } + } else { + if (meta_->missing_type != MissingType::NaN) { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + }; + } else { + int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumericalInt( + BEFORE_ARGUMENTS_INT); + if (hist_bits_acc <= 16) { + CHECK_LE(hist_bits_bin, 16); + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + if (hist_bits_bin == 32) { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } else { + FindBestThresholdSequentiallyInt( + FUNC_ARGUMENTS_INT); + } + } + output->default_left = false; + }; + } + } +#undef TEMPLATE_PREFIX_INT +#undef LAMBDA_ARGUMENTS_INT +#undef BEFORE_ARGUMENTS_INT +#undef FUNC_ARGURMENTS_INT + } else { #define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING #define LAMBDA_ARGUMENTS \ double sum_gradient, double sum_hessian, data_size_t num_data, \ @@ -164,56 +393,57 @@ class FeatureHistogram { sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \ output, rand_threshold, 
parent_output - if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { - if (meta_->missing_type == MissingType::Zero) { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; - } else { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; - } - } else { - if (meta_->missing_type != MissingType::NaN) { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - }; + if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { + if (meta_->missing_type == MissingType::Zero) { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } else { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } } else { - find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { - int rand_threshold = 0; - double min_gain_shift = - BeforeNumercal( - BEFORE_ARGUMENTS); - FindBestThresholdSequentially( - FUNC_ARGUMENTS); - output->default_left = false; - }; + if (meta_->missing_type != MissingType::NaN) { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); 
+ FindBestThresholdSequentially( + FUNC_ARGUMENTS); + }; + } else { + find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { + int rand_threshold = 0; + double min_gain_shift = + BeforeNumerical( + BEFORE_ARGUMENTS); + FindBestThresholdSequentially( + FUNC_ARGUMENTS); + output->default_left = false; + }; + } } - } #undef TEMPLATE_PREFIX #undef LAMBDA_ARGUMENTS #undef BEFORE_ARGUMENTS #undef FUNC_ARGURMENTS + } } void FuncForCategorical() { @@ -716,6 +946,14 @@ class FeatureHistogram { return (meta_->num_bin - meta_->offset) * kHistEntrySize; } + int SizeOfInt32Histgram() const { + return (meta_->num_bin - meta_->offset) * kInt32HistEntrySize; + } + + int SizeOfInt16Histgram() const { + return (meta_->num_bin - meta_->offset) * kInt16HistEntrySize; + } + /*! * \brief Restore histogram from memory */ @@ -724,6 +962,16 @@ class FeatureHistogram { (meta_->num_bin - meta_->offset) * kHistEntrySize); } + void FromMemoryInt32(char* memory_data) { + std::memcpy(data_, memory_data, + (meta_->num_bin - meta_->offset) * kInt32HistEntrySize); + } + + void FromMemoryInt16(char* memory_data) { + std::memcpy(data_int16_, memory_data, + (meta_->num_bin - meta_->offset) * kInt16HistEntrySize); + } + /*! * \brief True if this histogram can be splitted */ @@ -1082,14 +1330,312 @@ class FeatureHistogram { } } + template + void FindBestThresholdSequentiallyInt(int64_t int_sum_gradient_and_hessian, + const double grad_scale, const double hess_scale, + data_size_t num_data, + const FeatureConstraint* constraints, + double min_gain_shift, SplitInfo* output, + int rand_threshold, double parent_output) { + const int8_t offset = meta_->offset; + PACKED_HIST_ACC_T best_sum_left_gradient_and_hessian = 0; + PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian = + HIST_BITS_ACC == 16 ? 
+ ((static_cast(int_sum_gradient_and_hessian >> 32) << 16) | static_cast(int_sum_gradient_and_hessian & 0x0000ffff)) : + int_sum_gradient_and_hessian; + double best_gain = kMinScore; + uint32_t best_threshold = static_cast(meta_->num_bin); + const double cnt_factor = static_cast(num_data) / + static_cast(static_cast(int_sum_gradient_and_hessian & 0x00000000ffffffff)); + + BasicConstraint best_right_constraints; + BasicConstraint best_left_constraints; + bool constraint_update_necessary = + USE_MC && constraints->ConstraintDifferentDependingOnThreshold(); + + if (USE_MC) { + constraints->InitCumulativeConstraints(REVERSE); + } + + const PACKED_HIST_BIN_T* data_ptr = nullptr; + if (HIST_BITS_BIN == 16) { + data_ptr = reinterpret_cast(data_int16_); + } else { + data_ptr = reinterpret_cast(data_); + } + if (REVERSE) { + PACKED_HIST_ACC_T sum_right_gradient_and_hessian = 0; + + int t = meta_->num_bin - 1 - offset - NA_AS_MISSING; + const int t_end = 1 - offset; + + // from right to left, and we don't need data in bin0 + for (; t >= t_end; --t) { + // need to skip default bin + if (SKIP_DEFAULT_BIN) { + if ((t + offset) == static_cast(meta_->default_bin)) { + continue; + } + } + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_right_gradient_and_hessian += grad_and_hess_acc; + } else { + sum_right_gradient_and_hessian += grad_and_hess; + } + const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ? 
+ static_cast(sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_right_gradient_and_hessian & 0x00000000ffffffff); + data_size_t right_count = Common::RoundInt(int_sum_right_hessian * cnt_factor); + double sum_right_hessian = int_sum_right_hessian * hess_scale; + // if data not enough, or sum hessian too small + if (right_count < meta_->config->min_data_in_leaf || + sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t left_count = num_data - right_count; + // if data not enough + if (left_count < meta_->config->min_data_in_leaf) { + break; + } + + const PACKED_HIST_ACC_T sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_right_gradient_and_hessian; + const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_left_gradient_and_hessian & 0x00000000ffffffff); + double sum_left_hessian = int_sum_left_hessian * hess_scale; + // if sum hessian too small + if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + double sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + double sum_left_gradient = HIST_BITS_ACC == 16 ? 
+ static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + if (USE_RAND) { + if (t - 1 + offset != rand_threshold) { + continue; + } + } + + if (USE_MC && constraint_update_necessary) { + constraints->Update(t + offset); + } + + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type, meta_->config->path_smooth, + left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + if (USE_MC) { + best_right_constraints = constraints->RightToBasicConstraint(); + best_left_constraints = constraints->LeftToBasicConstraint(); + if (best_right_constraints.min > best_right_constraints.max || + best_left_constraints.min > best_left_constraints.max) { + continue; + } + } + best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian; + // left is <= threshold, right is > threshold. so this is t-1 + best_threshold = static_cast(t - 1 + offset); + best_gain = current_gain; + } + } + } else { + PACKED_HIST_ACC_T sum_left_gradient_and_hessian = 0; + + int t = 0; + const int t_end = meta_->num_bin - 2 - offset; + + if (NA_AS_MISSING) { + if (offset == 1) { + sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian; + for (int i = 0; i < meta_->num_bin - offset; ++i) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[i]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? 
+ ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_left_gradient_and_hessian -= grad_and_hess_acc; + } else { + sum_left_gradient_and_hessian -= grad_and_hess; + } + } + t = -1; + } + } + + for (; t <= t_end; ++t) { + if (SKIP_DEFAULT_BIN) { + if ((t + offset) == static_cast(meta_->default_bin)) { + continue; + } + } + if (t >= 0) { + const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t]; + if (HIST_BITS_ACC != HIST_BITS_BIN) { + const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ? + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x0000ffff))) : + ((static_cast(static_cast(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) | + (static_cast(grad_and_hess & 0x00000000ffffffff))); + sum_left_gradient_and_hessian += grad_and_hess_acc; + } else { + sum_left_gradient_and_hessian += grad_and_hess; + } + } + // if data not enough, or sum hessian too small + const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_left_gradient_and_hessian & 0x00000000ffffffff); + const data_size_t left_count = Common::RoundInt(static_cast(int_sum_left_hessian) * cnt_factor); + const double sum_left_hessian = static_cast(int_sum_left_hessian) * hess_scale; + if (left_count < meta_->config->min_data_in_leaf || + sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) { + continue; + } + data_size_t right_count = num_data - left_count; + // if data not enough + if (right_count < meta_->config->min_data_in_leaf) { + break; + } + + const PACKED_HIST_ACC_T sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_left_gradient_and_hessian; + const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ? 
+ static_cast(sum_right_gradient_and_hessian & 0x0000ffff) : + static_cast(sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double sum_right_hessian = static_cast(int_sum_right_hessian) * hess_scale; + // if sum Hessian too small + if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) { + break; + } + + double sum_right_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_right_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_right_gradient_and_hessian >> 32)) * grad_scale; + double sum_left_gradient = HIST_BITS_ACC == 16 ? + static_cast(static_cast(sum_left_gradient_and_hessian >> 16)) * grad_scale : + static_cast(static_cast(sum_left_gradient_and_hessian >> 32)) * grad_scale; + if (USE_RAND) { + if (t + offset != rand_threshold) { + continue; + } + } + // current split gain + double current_gain = GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + constraints, meta_->monotone_type, meta_->config->path_smooth, left_count, + right_count, parent_output); + // gain with split is worse than without split + if (current_gain <= min_gain_shift) { + continue; + } + + // mark as able to be split + is_splittable_ = true; + // better split point + if (current_gain > best_gain) { + if (USE_MC) { + best_right_constraints = constraints->RightToBasicConstraint(); + best_left_constraints = constraints->LeftToBasicConstraint(); + if (best_right_constraints.min > best_right_constraints.max || + best_left_constraints.min > best_left_constraints.max) { + continue; + } + } + best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian; + best_threshold = static_cast(t + offset); + best_gain = current_gain; + } + } + } + + if (is_splittable_ && best_gain > output->gain + min_gain_shift) { + const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ? 
+ static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) : + static_cast(best_sum_left_gradient_and_hessian >> 32); + const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ? + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff) : + static_cast(best_sum_left_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_left_gradient = static_cast(int_best_sum_left_gradient) * grad_scale; + const double best_sum_left_hessian = static_cast(int_best_sum_left_hessian) * hess_scale; + const int64_t best_sum_left_gradient_and_hessian_int64 = HIST_BITS_ACC == 16 ? + ((static_cast(static_cast(best_sum_left_gradient_and_hessian >> 16)) << 32) | + static_cast(best_sum_left_gradient_and_hessian & 0x0000ffff)) : + best_sum_left_gradient_and_hessian; + const int64_t best_sum_right_gradient_and_hessian = int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian_int64; + const int32_t int_best_sum_right_gradient = static_cast(best_sum_right_gradient_and_hessian >> 32); + const uint32_t int_best_sum_right_hessian = static_cast(best_sum_right_gradient_and_hessian & 0x00000000ffffffff); + const double best_sum_right_gradient = static_cast(int_best_sum_right_gradient) * grad_scale; + const double best_sum_right_hessian = static_cast(int_best_sum_right_hessian) * hess_scale; + const data_size_t best_left_count = Common::RoundInt(static_cast(int_best_sum_left_hessian) * cnt_factor); + const data_size_t best_right_count = Common::RoundInt(static_cast(int_best_sum_right_hessian) * cnt_factor); + // update split information + output->threshold = best_threshold; + output->left_output = + CalculateSplittedLeafOutput( + best_sum_left_gradient, best_sum_left_hessian, + meta_->config->lambda_l1, meta_->config->lambda_l2, + meta_->config->max_delta_step, best_left_constraints, meta_->config->path_smooth, + best_left_count, parent_output); + output->left_count = best_left_count; + output->left_sum_gradient = best_sum_left_gradient; + 
output->left_sum_hessian = best_sum_left_hessian; + output->left_sum_gradient_and_hessian = best_sum_left_gradient_and_hessian_int64; + output->right_output = + CalculateSplittedLeafOutput( + best_sum_right_gradient, + best_sum_right_hessian, meta_->config->lambda_l1, + meta_->config->lambda_l2, meta_->config->max_delta_step, + best_right_constraints, meta_->config->path_smooth, best_right_count, + parent_output); + output->right_count = best_right_count; + output->right_sum_gradient = best_sum_right_gradient; + output->right_sum_hessian = best_sum_right_hessian; + output->right_sum_gradient_and_hessian = best_sum_right_gradient_and_hessian; + output->gain = best_gain - min_gain_shift; + output->default_left = REVERSE; + } + } + const FeatureMetainfo* meta_; /*! \brief sum of gradient of each bin */ hist_t* data_; + int16_t* data_int16_; bool is_splittable_ = true; std::function find_best_threshold_fun_; + + std::function + int_find_best_threshold_fun_; }; class HistogramPool { @@ -1200,18 +1746,35 @@ class HistogramPool { pool_.resize(cache_size); data_.resize(cache_size); } - OMP_INIT_EX(); -#pragma omp parallel for schedule(static) - for (int i = old_cache_size; i < cache_size; ++i) { - OMP_LOOP_EX_BEGIN(); - pool_[i].reset(new FeatureHistogram[train_data->num_features()]); - data_[i].resize(num_total_bin * 2); - for (int j = 0; j < train_data->num_features(); ++j) { - pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); + + if (config->use_quantized_grad) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = old_cache_size; i < cache_size; ++i) { + OMP_LOOP_EX_BEGIN(); + pool_[i].reset(new FeatureHistogram[train_data->num_features()]); + data_[i].resize(num_total_bin); + for (int j = 0; j < train_data->num_features(); ++j) { + int16_t* data_ptr = reinterpret_cast(data_[i].data()); + pool_[i][j].Init(data_[i].data() + offsets[j], data_ptr + 2 * offsets[j], &feature_metas_[j]); + } + OMP_LOOP_EX_END(); + } + 
OMP_THROW_EX(); + } else { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) + for (int i = old_cache_size; i < cache_size; ++i) { + OMP_LOOP_EX_BEGIN(); + pool_[i].reset(new FeatureHistogram[train_data->num_features()]); + data_[i].resize(num_total_bin * 2); + for (int j = 0; j < train_data->num_features(); ++j) { + pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]); + } + OMP_LOOP_EX_END(); } - OMP_LOOP_EX_END(); + OMP_THROW_EX(); } - OMP_THROW_EX(); } void ResetConfig(const Dataset* train_data, const Config* config) { diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index f92da0fe9f76..294be28b6f86 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -991,7 +991,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u nullptr, nullptr, nullptr, nullptr); // then construct sparse features on CPU - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), @@ -1056,7 +1056,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_u gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data()); // then construct sparse features on CPU - train_data_->ConstructHistograms(is_sparse_feature_used, + train_data_->ConstructHistograms(is_sparse_feature_used, larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), diff --git a/src/treelearner/gradient_discretizer.cpp b/src/treelearner/gradient_discretizer.cpp new file mode 100644 index 000000000000..4c00f73ab12c --- /dev/null +++ b/src/treelearner/gradient_discretizer.cpp @@ -0,0 +1,262 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. 
All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ + +#include "gradient_discretizer.hpp" +#include + +#include +#include +#include + +namespace LightGBM { + +void GradientDiscretizer::Init( + const data_size_t num_data, const int num_leaves, + const int num_features, const Dataset* train_data) { + discretized_gradients_and_hessians_vector_.resize(num_data * 2); + gradient_random_values_.resize(num_data); + hessian_random_values_.resize(num_data); + random_values_use_start_eng_ = std::mt19937(random_seed_); + random_values_use_start_dist_ = std::uniform_int_distribution(0, num_data); + + const int num_threads = OMP_NUM_THREADS(); + int num_blocks = 0; + data_size_t block_size = 0; + Threading::BlockInfo(num_data, 512, &num_blocks, &block_size); + #pragma omp parallel for schedule(static, 1) num_threads(num_threads) + for (int thread_id = 0; thread_id < num_blocks; ++thread_id) { + const data_size_t start = thread_id * block_size; + const data_size_t end = std::min(start + block_size, num_data); + std::mt19937 gradient_random_values_eng(random_seed_ + thread_id); + std::uniform_real_distribution gradient_random_values_dist(0.0f, 1.0f); + std::mt19937 hessian_random_values_eng(random_seed_ + thread_id + num_threads); + std::uniform_real_distribution hessian_random_values_dist(0.0f, 1.0f); + for (data_size_t i = start; i < end; ++i) { + gradient_random_values_[i] = gradient_random_values_dist(gradient_random_values_eng); + hessian_random_values_[i] = hessian_random_values_dist(hessian_random_values_eng); + } + } + + max_gradient_abs_ = 0.0f; + max_hessian_abs_ = 0.0f; + + gradient_scale_ = 0.0f; + hessian_scale_ = 0.0f; + inverse_gradient_scale_ = 0.0f; + inverse_hessian_scale_ = 0.0f; + + num_leaves_ = num_leaves; + leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + node_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + global_leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0); 
+ global_node_num_bits_in_histogram_bin_.resize(num_leaves_, 0); + + leaf_grad_hess_stats_.resize(num_leaves_ * 2, 0.0); + change_hist_bits_buffer_.resize(num_features); + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (int feature_index = 0; feature_index < num_features; ++feature_index) { + const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index); + change_hist_bits_buffer_[feature_index].resize((bin_mapper->num_bin() - static_cast(bin_mapper->GetMostFreqBin() == 0)) * 2); + } + + ordered_int_gradients_and_hessians_.resize(2 * num_data); +} + +void GradientDiscretizer::DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians) { + double max_gradient = std::fabs(input_gradients[0]); + double max_hessian = std::fabs(input_hessians[0]); + const int num_threads = OMP_NUM_THREADS(); + std::vector thread_max_gradient(num_threads, max_gradient); + std::vector thread_max_hessian(num_threads, max_hessian); + Threading::For(0, num_data, 1024, + [input_gradients, input_hessians, &thread_max_gradient, &thread_max_hessian] + (int, data_size_t start, data_size_t end) { + int thread_id = omp_get_thread_num(); + for (data_size_t i = start; i < end; ++i) { + double fabs_grad = std::fabs(input_gradients[i]); + double fabs_hess = std::fabs(input_hessians[i]); + if (fabs_grad > thread_max_gradient[thread_id]) { + thread_max_gradient[thread_id] = fabs_grad; + } + if (fabs_hess > thread_max_hessian[thread_id]) { + thread_max_hessian[thread_id] = fabs_hess; + } + }}); + max_gradient = thread_max_gradient[0]; + max_hessian = thread_max_hessian[0]; + for (int thread_id = 1; thread_id < num_threads; ++thread_id) { + if (max_gradient < thread_max_gradient[thread_id]) { + max_gradient = thread_max_gradient[thread_id]; + } + if (max_hessian < thread_max_hessian[thread_id]) { + max_hessian = thread_max_hessian[thread_id]; + } + } + if (Network::num_machines() > 1) { + max_gradient = 
Network::GlobalSyncUpByMax(max_gradient); + max_hessian = Network::GlobalSyncUpByMax(max_hessian); + } + max_gradient_abs_ = max_gradient; + max_hessian_abs_ = max_hessian; + gradient_scale_ = max_gradient_abs_ / static_cast(num_grad_quant_bins_ / 2); + if (is_constant_hessian_) { + hessian_scale_ = max_hessian_abs_; + } else { + hessian_scale_ = max_hessian_abs_ / static_cast(num_grad_quant_bins_); + } + inverse_gradient_scale_ = 1.0f / gradient_scale_; + inverse_hessian_scale_ = 1.0f / hessian_scale_; + + const int random_values_use_start = random_values_use_start_dist_(random_values_use_start_eng_); + int8_t* discretized_int8 = discretized_gradients_and_hessians_vector_.data(); + if (stochastic_rounding_) { + if (is_constant_hessian_) { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + const data_size_t random_value_pos = (i + random_values_use_start) % num_data; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) : + static_cast(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]); + discretized_int8[2 * i] = static_cast(1); + } + } else { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + const data_size_t random_value_pos = (i + random_values_use_start) % num_data; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? 
+ static_cast(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) : + static_cast(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]); + discretized_int8[2 * i] = static_cast(input_hessians[i] * inverse_hessian_scale_ + hessian_random_values_[random_value_pos]); + } + } + } else { + if (is_constant_hessian_) { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast(gradient * inverse_gradient_scale_ + 0.5) : + static_cast(gradient * inverse_gradient_scale_ - 0.5); + discretized_int8[2 * i] = static_cast(1); + } + } else { + #pragma omp parallel for schedule(static) num_threads(num_threads) + for (data_size_t i = 0; i < num_data; ++i) { + const double gradient = input_gradients[i]; + discretized_int8[2 * i + 1] = gradient >= 0.0f ? + static_cast(gradient * inverse_gradient_scale_ + 0.5) : + static_cast(gradient * inverse_gradient_scale_ - 0.5); + discretized_int8[2 * i] = static_cast(input_hessians[i] * inverse_hessian_scale_ + 0.5); + } + } + } +} + +template +void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf) { + std::vector& leaf_num_bits_in_histogram_bin = IS_GLOBAL ? + global_leaf_num_bits_in_histogram_bin_ : leaf_num_bits_in_histogram_bin_; + std::vector& node_num_bits_in_histogram_bin = IS_GLOBAL ? 
+ global_node_num_bits_in_histogram_bin_ : node_num_bits_in_histogram_bin_; + if (right_leaf_index == -1) { + const uint64_t max_stat_per_bin = static_cast(num_data_in_left_leaf) * static_cast(num_grad_quant_bins_); + if (max_stat_per_bin < 256) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 8; + } else if (max_stat_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 32; + } + } else { + const uint64_t max_stat_left_per_bin = static_cast(num_data_in_left_leaf) * static_cast(num_grad_quant_bins_); + const uint64_t max_stat_right_per_bin = static_cast(num_data_in_right_leaf) * static_cast(num_grad_quant_bins_); + node_num_bits_in_histogram_bin[left_leaf_index] = leaf_num_bits_in_histogram_bin[left_leaf_index]; + if (max_stat_left_per_bin < 256) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 8; + } else if (max_stat_left_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[left_leaf_index] = 32; + } + if (max_stat_right_per_bin < 256) { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 8; + } else if (max_stat_right_per_bin < 65536) { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 16; + } else { + leaf_num_bits_in_histogram_bin[right_leaf_index] = 32; + } + } +} + +template void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + +template void GradientDiscretizer::SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + +void GradientDiscretizer::RenewIntGradTreeOutput( + Tree* tree, const Config* config, const DataPartition* data_partition, + const score_t* gradients, const score_t* hessians, + const std::function& leaf_index_to_global_num_data) 
{ + global_timer.Start("GradientDiscretizer::RenewIntGradTreeOutput"); + if (config->tree_learner == std::string("data")) { + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + data_size_t leaf_cnt = 0; + const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); + double sum_gradient = 0.0f, sum_hessian = 0.0f; + #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + for (data_size_t i = 0; i < leaf_cnt; ++i) { + const data_size_t index = data_indices[i]; + const score_t grad = gradients[index]; + const score_t hess = hessians[index]; + sum_gradient += grad; + sum_hessian += hess; + } + leaf_grad_hess_stats_[2 * leaf_id] = sum_gradient; + leaf_grad_hess_stats_[2 * leaf_id + 1] = sum_hessian; + } + std::vector global_leaf_grad_hess_stats = Network::GlobalSum(&leaf_grad_hess_stats_); + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + const double sum_gradient = global_leaf_grad_hess_stats[2 * leaf_id]; + const double sum_hessian = global_leaf_grad_hess_stats[2 * leaf_id + 1]; + const double leaf_output = FeatureHistogram::CalculateSplittedLeafOutput( + sum_gradient, sum_hessian, + config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth, + leaf_index_to_global_num_data(leaf_id), 0.0f); + tree->SetLeafOutput(leaf_id, leaf_output); + } + } else { + for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) { + data_size_t leaf_cnt = 0; + const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); + double sum_gradient = 0.0f, sum_hessian = 0.0f; + #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + for (data_size_t i = 0; i < leaf_cnt; ++i) { + const data_size_t index = data_indices[i]; + const score_t grad = gradients[index]; + const score_t hess = hessians[index]; + sum_gradient += grad; + sum_hessian += hess; + } + const double leaf_output = 
FeatureHistogram::CalculateSplittedLeafOutput(sum_gradient, sum_hessian, + config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth, + leaf_cnt, 0.0f); + tree->SetLeafOutput(leaf_id, leaf_output); + } + } + global_timer.Stop("GradientDiscretizer::RenewIntGradTreeOutput"); +} + +} // namespace LightGBM diff --git a/src/treelearner/gradient_discretizer.hpp b/src/treelearner/gradient_discretizer.hpp new file mode 100644 index 000000000000..352788f7d093 --- /dev/null +++ b/src/treelearner/gradient_discretizer.hpp @@ -0,0 +1,128 @@ +/*! + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. + */ +#ifndef LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ +#define LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ + +#include +#include +#include +#include + +#include +#include + +#include "data_partition.hpp" +#include "feature_histogram.hpp" + +namespace LightGBM { + +class GradientDiscretizer { + public: + GradientDiscretizer(int num_grad_quant_bins, int num_trees, int random_seed, bool is_constant_hessian, const bool stochastic_rounding) { + num_grad_quant_bins_ = num_grad_quant_bins; + iter_ = 0; + num_trees_ = num_trees; + random_seed_ = random_seed; + is_constant_hessian_ = is_constant_hessian; + stochastic_rounding_ = stochastic_rounding; + } + + ~GradientDiscretizer() {} + + virtual void DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians); + + virtual const int8_t* discretized_gradients_and_hessians() const { + return discretized_gradients_and_hessians_vector_.data(); + } + + virtual double grad_scale() const { + return gradient_scale_; + } + + virtual double hess_scale() const { + return hessian_scale_; + } + + virtual void Init( + const data_size_t num_data, const int num_leaves, + const int num_features, const Dataset* train_data); + + template + void 
SetNumBitsInHistogramBin( + const int left_leaf_index, const int right_leaf_index, + const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf); + + template + int8_t GetHistBitsInLeaf(const int leaf_index) { + if (IS_GLOBAL) { + return global_leaf_num_bits_in_histogram_bin_[leaf_index]; + } else { + return leaf_num_bits_in_histogram_bin_[leaf_index]; + } + } + + template + int8_t GetHistBitsInNode(const int node_index) { + if (IS_GLOBAL) { + return global_node_num_bits_in_histogram_bin_[node_index]; + } else { + return node_num_bits_in_histogram_bin_[node_index]; + } + } + + int8_t* ordered_int_gradients_and_hessians() { + return ordered_int_gradients_and_hessians_.data(); + } + + void RenewIntGradTreeOutput( + Tree* tree, const Config* config, const DataPartition* data_partition, + const score_t* gradients, const score_t* hessians, + const std::function& leaf_index_to_global_num_data); + + int32_t* GetChangeHistBitsBuffer(const int feature_index) { + return change_hist_bits_buffer_[feature_index].data(); + } + + protected: + int num_grad_quant_bins_; + int iter_; + int num_trees_; + int random_seed_; + bool stochastic_rounding_; + + std::vector gradient_random_values_; + std::vector hessian_random_values_; + std::mt19937 random_values_use_start_eng_; + std::uniform_int_distribution random_values_use_start_dist_; + std::vector discretized_gradients_and_hessians_vector_; + std::vector ordered_int_gradients_and_hessians_; + + double max_gradient_abs_; + double max_hessian_abs_; + + double gradient_scale_; + double hessian_scale_; + double inverse_gradient_scale_; + double inverse_hessian_scale_; + + bool is_constant_hessian_; + int num_leaves_; + + std::vector leaf_num_bits_in_histogram_bin_; + std::vector node_num_bits_in_histogram_bin_; + std::vector global_leaf_num_bits_in_histogram_bin_; + std::vector global_node_num_bits_in_histogram_bin_; + + std::vector leaf_grad_hess_stats_; + std::vector> change_hist_bits_buffer_; +}; + +} // 
namespace LightGBM + +#endif // LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_ diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index 46d8ce417857..163bfc4df9ca 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -85,6 +85,38 @@ class LeafSplits { sum_hessians_ = tmp_sum_hessians; } + + /*! + * \brief Init splits on the current leaf, it will traverse all data to sum up the results + * \param int_gradients_and_hessians Discretized gradients and hessians + * \param grad_scale Scaling factor to recover original gradients from discretized gradients + * \param hess_scale Scaling factor to recover original hessians from discretized hessians + */ + void Init(const int8_t* int_gradients_and_hessians, + const double grad_scale, const double hess_scale) { + num_data_in_leaf_ = num_data_; + leaf_index_ = 0; + data_indices_ = nullptr; + double tmp_sum_gradients = 0.0f; + double tmp_sum_hessians = 0.0f; + const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); + int64_t tmp_sum_gradients_and_hessians = 0; +#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) + for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { + tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale; + tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale; + const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i]; + const int64_t packed_long_int_grad_and_hess = + (static_cast(static_cast(packed_int_grad_and_hess >> 8)) << 32) | + (static_cast(packed_int_grad_and_hess & 0x00ff)); + tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess; + } + sum_gradients_ = tmp_sum_gradients; + sum_hessians_ = tmp_sum_hessians; + int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians; + } + + /*! 
* \brief Init splits on current leaf of partial data. * \param leaf Index of current leaf @@ -109,6 +141,40 @@ class LeafSplits { } + /*! + * \brief Init splits on current leaf of partial data. + * \param leaf Index of current leaf + * \param data_partition current data partition + * \param int_gradients_and_hessians Discretized gradients and hessians + * \param grad_scale Scaling factor to recover original gradients from discretized gradients + * \param hess_scale Scaling factor to recover original hessians from discretized hessians + */ + void Init(int leaf, const DataPartition* data_partition, + const int8_t* int_gradients_and_hessians, + const score_t grad_scale, const score_t hess_scale) { + leaf_index_ = leaf; + data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); + double tmp_sum_gradients = 0.0f; + double tmp_sum_hessians = 0.0f; + const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); + int64_t tmp_sum_gradients_and_hessians = 0; +#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) + for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { + const data_size_t idx = data_indices_[i]; /* row of the i-th sample that belongs to this leaf */ + tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale; + tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale; + const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[idx]; /* same row as the two sums above: index by idx, not by loop position */ + const int64_t packed_long_int_grad_and_hess = + (static_cast(static_cast(packed_int_grad_and_hess >> 8)) << 32) | + (static_cast(packed_int_grad_and_hess & 0x00ff)); + tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess; + } + sum_gradients_ = tmp_sum_gradients; + sum_hessians_ = tmp_sum_hessians; + int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians; + } + + /*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians * \param sum_gradients @@ -120,6 +186,19 @@ class LeafSplits { sum_hessians_ = sum_hessians; } + /*! + * \brief Init splits on current leaf, only update sum_gradients and sum_hessians + * \param sum_gradients + * \param sum_hessians + * \param int_sum_gradients_and_hessians + */ + void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) { + leaf_index_ = 0; + sum_gradients_ = sum_gradients; + sum_hessians_ = sum_hessians; + int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians; + } + /*! * \brief Init splits on current leaf */ @@ -142,6 +221,9 @@ class LeafSplits { /*! \brief Get sum of Hessians of current leaf */ double sum_hessians() const { return sum_hessians_; } + /*! \brief Get sum of discretized gradients and Hessians of current leaf */ + int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; } + /*! \brief Get indices of data of current leaf */ const data_size_t* data_indices() const { return data_indices_; } @@ -162,6 +244,8 @@ class LeafSplits { double sum_gradients_; /*! \brief sum of Hessians of current leaf */ double sum_hessians_; + /*! \brief sum of discretized gradients and Hessians of current leaf */ + int64_t int_sum_gradients_and_hessians_; /*! \brief indices of data of current leaf */ const data_size_t* data_indices_; /*! 
\brief weight of current leaf */ diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 29f4e1688b99..b942dceab28b 100644 --- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -71,15 +71,24 @@ class DataParallelTreeLearner: public TREELEARNER_T { } } + void PrepareBufferPos( + const std::vector>& feature_distribution, + std::vector* block_start, + std::vector* block_len, + std::vector* buffer_write_start_pos, + std::vector* buffer_read_start_pos, + comm_size_t* reduce_scatter_size, + size_t hist_entry_size); + private: /*! \brief Rank of local machine */ int rank_; /*! \brief Number of machines of this parallel task */ int num_machines_; /*! \brief Buffer for network send */ - std::vector input_buffer_; + std::vector> input_buffer_; /*! \brief Buffer for network receive */ - std::vector output_buffer_; + std::vector> output_buffer_; /*! \brief different machines will aggregate histograms for different features, use this to mark local aggregate features*/ std::vector is_feature_aggregated_; @@ -87,12 +96,22 @@ class DataParallelTreeLearner: public TREELEARNER_T { std::vector block_start_; /*! \brief Block size for reduce scatter */ std::vector block_len_; + /*! \brief Block start index for reduce scatter with int16 histograms */ + std::vector block_start_int16_; + /*! \brief Block size for reduce scatter with int16 histograms */ + std::vector block_len_int16_; /*! \brief Write positions for feature histograms */ std::vector buffer_write_start_pos_; /*! \brief Read positions for local feature histograms */ std::vector buffer_read_start_pos_; + /*! \brief Write positions for feature histograms with int16 histograms*/ + std::vector buffer_write_start_pos_int16_; + /*! \brief Read positions for local feature histograms with int16 histograms */ + std::vector buffer_read_start_pos_int16_; /*! \brief Size for reduce scatter */ comm_size_t reduce_scatter_size_; + /*! 
\brief Size for reduce scatter with int16 histogram*/ + comm_size_t reduce_scatter_size_int16_; /*! \brief Store global number of data in leaves */ std::vector global_data_count_in_leaf_; }; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 5ca8a3f047f6..c322c1a796c2 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -21,6 +21,7 @@ namespace LightGBM { SerialTreeLearner::SerialTreeLearner(const Config* config) : config_(config), col_sampler_(config) { + gradient_discretizer_ = nullptr; } SerialTreeLearner::~SerialTreeLearner() { @@ -60,6 +61,11 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian ordered_gradients_.resize(num_data_); ordered_hessians_.resize(num_data_); + if (config_->use_quantized_grad) { + gradient_discretizer_.reset(new GradientDiscretizer(config_->num_grad_quant_bins, config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding)); + gradient_discretizer_->Init(num_data_, config_->num_leaves, num_features_, train_data_); + } + GetShareStates(train_data_, is_constant_hessian, true); histogram_pool_.DynamicChangeSize(train_data_, share_state_->num_hist_total_bin(), @@ -76,17 +82,31 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset, bool is_constant_hessian, bool is_first_time) { if (is_first_time) { - share_state_.reset(dataset->GetShareStates( - ordered_gradients_.data(), ordered_hessians_.data(), + if (config_->use_quantized_grad) { + share_state_.reset(dataset->GetShareStates( + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, col_sampler_.is_feature_used_bytree(), is_constant_hessian, - config_->force_col_wise, config_->force_row_wise)); + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + } else { + share_state_.reset(dataset->GetShareStates( + ordered_gradients_.data(), ordered_hessians_.data(), + 
col_sampler_.is_feature_used_bytree(), is_constant_hessian, + config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins)); + } } else { CHECK_NOTNULL(share_state_); // cannot change is_hist_col_wise during training - share_state_.reset(dataset->GetShareStates( - ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), - is_constant_hessian, share_state_->is_col_wise, - !share_state_->is_col_wise)); + if (config_->use_quantized_grad) { + share_state_.reset(dataset->GetShareStates( + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr, + col_sampler_.is_feature_used_bytree(), is_constant_hessian, + share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins)); + } else { + share_state_.reset(dataset->GetShareStates( + ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(), + is_constant_hessian, share_state_->is_col_wise, + !share_state_->is_col_wise, config_->num_grad_quant_bins)); + } } CHECK_NOTNULL(share_state_); } @@ -169,6 +189,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians } share_state_->num_threads = num_threads; + if (config_->use_quantized_grad) { + gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_); + } + // some initial works before training BeforeTrain(); @@ -205,6 +229,11 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf)); } + if (config_->use_quantized_grad && config_->quant_train_renew_leaf) { + gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_, + [this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); }); + } + Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth); return tree.release(); } @@ -270,11 +299,25 @@ void 
SerialTreeLearner::BeforeTrain() { // Sumup for root if (data_partition_->leaf_count(0) == num_data_) { // use all data - smaller_leaf_splits_->Init(gradients_, hessians_); - + if (!config_->use_quantized_grad) { + smaller_leaf_splits_->Init(gradients_, hessians_); + } else { + smaller_leaf_splits_->Init( + gradient_discretizer_->discretized_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale()); + } } else { // use bagging, only use part of data - smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); + if (!config_->use_quantized_grad) { + smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_); + } else { + smaller_leaf_splits_->Init( + 0, data_partition_.get(), + gradient_discretizer_->discretized_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale()); + } } larger_leaf_splits_->Init(); @@ -282,6 +325,10 @@ void SerialTreeLearner::BeforeTrain() { if (cegb_ != nullptr) { cegb_->BeforeTrain(); } + + if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { + gradient_discretizer_->SetNumBitsInHistogramBin(0, -1, data_partition_->leaf_count(0), 0); + } } bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { @@ -353,22 +400,67 @@ void SerialTreeLearner::ConstructHistograms( Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms", global_timer); // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = - smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - train_data_->ConstructHistograms( - is_feature_used, smaller_leaf_splits_->data_indices(), - smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_smaller_leaf_hist_data); - if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf - hist_t* 
ptr_larger_leaf_hist_data = - larger_leaf_histogram_array_[0].RawData() - kHistOffset; - train_data_->ConstructHistograms( - is_feature_used, larger_leaf_splits_->data_indices(), - larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, + if (config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_num_bits <= 16 ? + reinterpret_cast(smaller_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) : + reinterpret_cast(smaller_leaf_histogram_array_[0].RawDataInt32() - kHistOffset); + #define SMALLER_LEAF_ARGS \ + is_feature_used, smaller_leaf_splits_->data_indices(), \ + smaller_leaf_splits_->num_data_in_leaf(), \ + reinterpret_cast(gradient_discretizer_->discretized_gradients_and_hessians()), \ + nullptr, \ + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), \ + nullptr, \ + share_state_.get(), \ + reinterpret_cast(ptr_smaller_leaf_hist_data) + if (smaller_leaf_num_bits <= 16) { + train_data_->ConstructHistograms(SMALLER_LEAF_ARGS); + } else { + train_data_->ConstructHistograms(SMALLER_LEAF_ARGS); + } + #undef SMALLER_LEAF_ARGS + if (larger_leaf_histogram_array_ && !use_subtract) { + const uint8_t larger_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_num_bits <= 16 ? 
+ reinterpret_cast(larger_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) : + reinterpret_cast(larger_leaf_histogram_array_[0].RawDataInt32() - kHistOffset); + #define LARGER_LEAF_ARGS \ + is_feature_used, larger_leaf_splits_->data_indices(), \ + larger_leaf_splits_->num_data_in_leaf(), \ + reinterpret_cast(gradient_discretizer_->discretized_gradients_and_hessians()), \ + nullptr, \ + reinterpret_cast(gradient_discretizer_->ordered_int_gradients_and_hessians()), \ + nullptr, \ + share_state_.get(), \ + reinterpret_cast(ptr_larger_leaf_hist_data) + if (larger_leaf_num_bits <= 16) { + train_data_->ConstructHistograms(LARGER_LEAF_ARGS); + } else { + train_data_->ConstructHistograms(LARGER_LEAF_ARGS); + } + #undef LARGER_LEAF_ARGS + } + } else { + hist_t* ptr_smaller_leaf_hist_data = + smaller_leaf_histogram_array_[0].RawData() - kHistOffset; + train_data_->ConstructHistograms( + is_feature_used, smaller_leaf_splits_->data_indices(), + smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), - ptr_larger_leaf_hist_data); + ptr_smaller_leaf_hist_data); + if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { + // construct larger leaf + hist_t* ptr_larger_leaf_hist_data = + larger_leaf_histogram_array_[0].RawData() - kHistOffset; + train_data_->ConstructHistograms( + is_feature_used, larger_leaf_splits_->data_indices(), + larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_, + ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(), + ptr_larger_leaf_hist_data); + } } } @@ -388,6 +480,26 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( if (larger_leaf_splits_->leaf_index() >= 0) { larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index()); } + + if (use_subtract && config_->use_quantized_grad) { + const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); + const 
uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); + const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (parent_hist_bits > 16 && larger_hist_bits <= 16) { + OMP_INIT_EX(); + #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) + for (int feature_index = 0; feature_index < num_features_; ++feature_index) { + OMP_LOOP_EX_BEGIN(); + if (!is_feature_used[feature_index]) { + continue; + } + larger_leaf_histogram_array_[feature_index].CopyToBuffer(gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + } + } + OMP_INIT_EX(); // find splits #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads) @@ -397,10 +509,24 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( continue; } const int tid = omp_get_thread_num(); - train_data_->FixHistogram( - feature_index, smaller_leaf_splits_->sum_gradients(), - smaller_leaf_splits_->sum_hessians(), - smaller_leaf_histogram_array_[feature_index].RawData()); + if (config_->use_quantized_grad) { + const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians(); + if (hist_bits_bin <= 16) { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(smaller_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(smaller_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + train_data_->FixHistogram( + feature_index, smaller_leaf_splits_->sum_gradients(), + smaller_leaf_splits_->sum_hessians(), + smaller_leaf_histogram_array_[feature_index].RawData()); + } int real_fidx = train_data_->RealFeatureIndex(feature_index); 
ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index, @@ -417,13 +543,50 @@ void SerialTreeLearner::FindBestSplitsFromHistograms( } if (use_subtract) { - larger_leaf_histogram_array_[feature_index].Subtract( - smaller_leaf_histogram_array_[feature_index]); + if (config_->use_quantized_grad) { + const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index()); + const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode(parent_index); + const uint8_t smaller_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_splits_->leaf_index()); + const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (parent_hist_bits <= 16) { + CHECK_LE(smaller_hist_bits, 16); + CHECK_LE(larger_hist_bits, 16); + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } else if (larger_hist_bits <= 16) { + CHECK_LE(smaller_hist_bits, 16); + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index], gradient_discretizer_->GetChangeHistBitsBuffer(feature_index)); + } else if (smaller_hist_bits <= 16) { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } else { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } + } else { + larger_leaf_histogram_array_[feature_index].Subtract( + smaller_leaf_histogram_array_[feature_index]); + } } else { - train_data_->FixHistogram( - feature_index, larger_leaf_splits_->sum_gradients(), - larger_leaf_splits_->sum_hessians(), - larger_leaf_histogram_array_[feature_index].RawData()); + if (config_->use_quantized_grad) { + const int64_t int_sum_gradient_and_hessian = larger_leaf_splits_->int_sum_gradients_and_hessians(); + const uint8_t hist_bits_bin = 
gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_splits_->leaf_index()); + if (hist_bits_bin <= 16) { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(larger_leaf_histogram_array_[feature_index].RawDataInt16())); + } else { + train_data_->FixHistogramInt( + feature_index, int_sum_gradient_and_hessian, + reinterpret_cast(larger_leaf_histogram_array_[feature_index].RawDataInt32())); + } + } else { + train_data_->FixHistogram( + feature_index, larger_leaf_splits_->sum_gradients(), + larger_leaf_splits_->sum_hessians(), + larger_leaf_histogram_array_[feature_index].RawData()); + } } ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index, @@ -699,6 +862,11 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, best_split_info.left_sum_hessian, best_split_info.left_output); } + if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) { + gradient_discretizer_->SetNumBitsInHistogramBin(*left_leaf, *right_leaf, + data_partition_->leaf_count(*left_leaf), + data_partition_->leaf_count(*right_leaf)); + } auto leaves_need_update = constraints_->Update( is_numerical_split, *left_leaf, *right_leaf, best_split_info.monotone_type, best_split_info.right_output, @@ -762,9 +930,21 @@ void SerialTreeLearner::ComputeBestSplitForFeature( train_data_->FeatureNumBin(feature_index)); } SplitInfo new_split; - histogram_array_[feature_index].FindBestThreshold( - leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, - constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + if (config_->use_quantized_grad) { + const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf(leaf_splits->leaf_index()); + histogram_array_[feature_index].FindBestThresholdInt( + leaf_splits->int_sum_gradients_and_hessians(), + gradient_discretizer_->grad_scale(), + gradient_discretizer_->hess_scale(), + hist_bits_bin, + 
hist_bits_bin, + num_data, + constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + } else { + histogram_array_[feature_index].FindBestThreshold( + leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, + constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split); + } new_split.feature = real_fidx; if (cegb_ != nullptr) { new_split.gain -= diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 14b78eb6a577..1f8e3add0d8c 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -24,6 +24,7 @@ #include "col_sampler.hpp" #include "data_partition.hpp" #include "feature_histogram.hpp" +#include "gradient_discretizer.hpp" #include "leaf_splits.hpp" #include "monotone_constraints.hpp" #include "split_info.hpp" @@ -170,6 +171,8 @@ class SerialTreeLearner: public TreeLearner { std::set FindAllForceFeatures(Json force_split_leaf_setting); + void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index); + /*! * \brief Get the number of data in a leaf * \param leaf_idx The index of leaf @@ -230,6 +233,7 @@ class SerialTreeLearner: public TreeLearner { const Json* forced_split_json_; std::unique_ptr share_state_; std::unique_ptr cegb_; + std::unique_ptr gradient_discretizer_; }; inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const { diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp index 644bd329b3a6..234105eb9a34 100644 --- a/src/treelearner/split_info.hpp +++ b/src/treelearner/split_info.hpp @@ -40,10 +40,14 @@ struct SplitInfo { double left_sum_gradient = 0; /*! \brief Left sum hessian after split */ double left_sum_hessian = 0; + /*! \brief Left sum discretized gradient and hessian after split */ + int64_t left_sum_gradient_and_hessian = 0; /*! 
\brief Right sum gradient after split */ double right_sum_gradient = 0; /*! \brief Right sum hessian after split */ double right_sum_hessian = 0; + /*! \brief Right sum discretized gradient and hessian after split */ + int64_t right_sum_gradient_and_hessian = 0; std::vector cat_threshold; /*! \brief True if default split is left */ bool default_left = true; @@ -71,10 +75,14 @@ struct SplitInfo { buffer += sizeof(left_sum_gradient); std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian)); buffer += sizeof(left_sum_hessian); + std::memcpy(buffer, &left_sum_gradient_and_hessian, sizeof(left_sum_gradient_and_hessian)); + buffer += sizeof(left_sum_gradient_and_hessian); std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient)); buffer += sizeof(right_sum_gradient); std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian)); buffer += sizeof(right_sum_hessian); + std::memcpy(buffer, &right_sum_gradient_and_hessian, sizeof(right_sum_gradient_and_hessian)); + buffer += sizeof(right_sum_gradient_and_hessian); std::memcpy(buffer, &default_left, sizeof(default_left)); buffer += sizeof(default_left); std::memcpy(buffer, &monotone_type, sizeof(monotone_type)); @@ -103,10 +111,14 @@ struct SplitInfo { buffer += sizeof(left_sum_gradient); std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian)); buffer += sizeof(left_sum_hessian); + std::memcpy(&left_sum_gradient_and_hessian, buffer, sizeof(left_sum_gradient_and_hessian)); + buffer += sizeof(left_sum_gradient_and_hessian); std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient)); buffer += sizeof(right_sum_gradient); std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian)); buffer += sizeof(right_sum_hessian); + std::memcpy(&right_sum_gradient_and_hessian, buffer, sizeof(right_sum_gradient_and_hessian)); + buffer += sizeof(right_sum_gradient_and_hessian); std::memcpy(&default_left, buffer, sizeof(default_left)); buffer += sizeof(default_left); std::memcpy(&monotone_type, 
buffer, sizeof(monotone_type)); diff --git a/swig/pointer_manipulation.i b/swig/pointer_manipulation.i index 28635b34ac62..de0bddd42f8e 100644 --- a/swig/pointer_manipulation.i +++ b/swig/pointer_manipulation.i @@ -15,6 +15,7 @@ * to arrays of size max(int64_t) instead of max(int32_t). */ +%pointer_functions(uint8_t, bytep) %pointer_functions(int, intp) %pointer_functions(long, longp) %pointer_functions(double, doublep) @@ -33,6 +34,7 @@ %pointer_cast(double *, void *, double_to_voidp_ptr) %pointer_cast(float *, void *, float_to_voidp_ptr) %pointer_cast(int *, void *, int_to_voidp_ptr) +%pointer_cast(uint8_t *, void *, byte_to_voidp_ptr) %pointer_cast(int32_t *, void *, int32_t_to_voidp_ptr) %pointer_cast(int64_t *, void *, int64_t_to_voidp_ptr) diff --git a/tests/distributed/_test_distributed.py b/tests/distributed/_test_distributed.py index 9e1dd8e4f5a4..9ede4e0800fb 100644 --- a/tests/distributed/_test_distributed.py +++ b/tests/distributed/_test_distributed.py @@ -106,7 +106,7 @@ def _write_data(self, partitions: List[np.ndarray]) -> None: for i, partition in enumerate(partitions): np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',') - def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None: + def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None: """Run the distributed training process on a single machine. For each worker i: @@ -134,7 +134,7 @@ def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None: if result.returncode != 0: raise RuntimeError('Error in training') - def predict(self, predict_config: Dict[str, Any] = {}) -> np.ndarray: + def predict(self, predict_config: Dict[str, Any]) -> np.ndarray: """Compute the predictions using the model created in the fit step. 
predict_config is used to predict the training set train.txt @@ -178,7 +178,7 @@ def test_classifier(executable): } clf = DistributedMockup(executable) clf.fit(partitions, train_params) - y_probas = clf.predict() + y_probas = clf.predict(predict_config={}) y_pred = y_probas > 0.5 assert accuracy_score(clf.label_, y_pred) == 1. @@ -194,5 +194,5 @@ def test_regressor(executable): } reg = DistributedMockup(executable) reg.fit(partitions, train_params) - y_pred = reg.predict() + y_pred = reg.predict(predict_config={}) np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 7bb2d99a4037..5e237724ae85 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -2,6 +2,7 @@ import filecmp import numbers import re +from copy import deepcopy from os import getenv from pathlib import Path @@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path): d.set_label(y) b1 = lgb.Booster(train_set=d1) b = lgb.Booster(train_set=d) - for k in range(10): + for _ in range(10): b.update() b1.update() dname = tmp_path / "d.txt" @@ -365,7 +366,7 @@ def test_add_features_from_different_sources(): # test that method works for different data types d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct() - res_feature_names = [name for name in names] + res_feature_names = deepcopy(names) for idx, x_2 in enumerate(xxs, 2): original_type = type(d1.get_data()) d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct() @@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path): ds = lgb.Dataset(X, feature_name=names).construct() ds.set_label(y) base = lgb.Booster(train_set=ds) - for k in range(10): + for _ in range(10): base.update() basename = tmp_path / "basename.txt" base.save_model(basename) @@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path): {'cegb_penalty_split': 1}] for case in 
cases: booster = lgb.Booster(train_set=ds, params=case) - for k in range(10): + for _ in range(10): booster.update() casename = tmp_path / "casename.txt" booster.save_model(casename) @@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path): for (p1, p2) in pairs: booster1 = lgb.Booster(train_set=ds, params=p1) booster2 = lgb.Booster(train_set=ds, params=p2) - for k in range(10): + for _ in range(10): booster1.update() booster2.update() p1name = tmp_path / "p1.txt" @@ -632,17 +633,17 @@ def test_list_to_1d_numpy(collection, dtype): y = pd_Series(y) if isinstance(y, np.ndarray) and len(y.shape) == 2: with pytest.warns(UserWarning, match='column-vector'): - lgb.basic._list_to_1d_numpy(y) + lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list") return elif isinstance(y, list) and isinstance(y[0], list): with pytest.raises(TypeError): - lgb.basic._list_to_1d_numpy(y) + lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list") return elif isinstance(y, pd_Series) and y.dtype == object: with pytest.raises(ValueError): - lgb.basic._list_to_1d_numpy(y) + lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list") return - result = lgb.basic._list_to_1d_numpy(y, dtype=dtype) + result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name="list") assert result.size == 10 assert result.dtype == dtype @@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin): ]).T n_continuous = X.shape[1] - 1 feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1'] - ds_kwargs = dict( - params={'min_data_in_bin': min_data_in_bin}, - categorical_feature=[n_continuous], # last feature - ) + ds_kwargs = { + "params": {'min_data_in_bin': min_data_in_bin}, + "categorical_feature": [n_continuous], # last feature + } ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct() expected_num_bins = [ 100 // min_data_in_bin + 1, # extra bin for zero diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 
594f88f527ac..662020428270 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix, eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg}) init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e))) if 'dataframe' in output: - d_init_score = dy_e.map_partitions(lambda x: pd.Series([init_score_value] * x.size)) + d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size)) else: - d_init_score = dy_e.map_blocks(lambda x: np.repeat(init_score_value, x.size)) + d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size)) eval_init_score.append(d_init_score) @@ -1854,3 +1854,44 @@ def test_predict_with_raw_score(task, output, cluster): if task.endswith('classification'): pred_proba_raw = model.predict_proba(dX, raw_score=True).compute() assert_eq(raw_predictions, pred_proba_raw) + + +def test_distributed_quantized_training(cluster): + with Client(cluster) as client: + X, y, w, _, dX, dy, dw, _ = _create_data( + objective='regression', + output='array' + ) + + np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f") + + params = { + "boosting_type": 'gbdt', + "n_estimators": 50, + "num_leaves": 31, + 'use_quantized_grad': True, + 'num_grad_quant_bins': 30, + 'quant_train_renew_leaf': True, + 'verbose': -1, + 'force_row_wise': True, + } + + quant_dask_classifier = lgb.DaskLGBMRegressor( + client=client, + time_out=5, + **params + ) + quant_dask_classifier = quant_dask_classifier.fit(dX, dy, sample_weight=dw) + quant_p1 = quant_dask_classifier.predict(dX) + quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2)) + + params["use_quantized_grad"] = False + dask_classifier = lgb.DaskLGBMRegressor( + client=client, + time_out=5, + **params + ) + dask_classifier = dask_classifier.fit(dX, dy, sample_weight=dw) + p1 = 
dask_classifier.predict(dX) + rmse = np.sqrt(np.mean((p1.compute() - y) ** 2)) + assert quant_rmse < rmse + 7.0 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b9709e6bcea6..e87cea3bfcbb 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better): min_delta = metric2min_delta[metric[0]] else: min_delta = [metric2min_delta[m] for m in metric] - train_kwargs = dict( - params=params, - train_set=train_ds, - num_boost_round=50, - valid_sets=[train_ds, valid_ds], - valid_names=['training', 'valid'], - ) + train_kwargs = { + "params": params, + "train_set": train_ds, + "num_boost_round": 50, + "valid_sets": [train_ds, valid_ds], + "valid_names": ['training', 'valid'], + } # regular early stopping evals_result = {} @@ -1075,6 +1075,67 @@ def test_cv(): np.testing.assert_allclose(cv_res_lambda['valid ndcg@3-mean'], cv_res_lambda_obj['valid ndcg@3-mean']) +def test_cv_works_with_init_model(tmp_path): + X, y = make_synthetic_regression() + params = {'objective': 'regression', 'verbose': -1} + num_train_rounds = 2 + lgb_train = lgb.Dataset(X, y, free_raw_data=False) + bst = lgb.train( + params=params, + train_set=lgb_train, + num_boost_round=num_train_rounds + ) + preds_raw = bst.predict(X, raw_score=True) + model_path_txt = str(tmp_path / 'lgb.model') + bst.save_model(model_path_txt) + + num_cv_rounds = 5 + cv_kwargs = { + "num_boost_round": num_cv_rounds, + "nfold": 3, + "stratified": False, + "shuffle": False, + "seed": 708, + "return_cvbooster": True, + "params": params + } + + # init_model from an in-memory Booster + cv_res = lgb.cv( + train_set=lgb_train, + init_model=bst, + **cv_kwargs + ) + cv_bst_w_in_mem_init_model = cv_res["cvbooster"] + assert cv_bst_w_in_mem_init_model.current_iteration() == [num_train_rounds + num_cv_rounds] * 3 + for booster in 
cv_bst_w_in_mem_init_model.boosters: + np.testing.assert_allclose( + preds_raw, + booster.predict(X, raw_score=True, num_iteration=num_train_rounds) + ) + + # init_model from a text file + cv_res = lgb.cv( + train_set=lgb_train, + init_model=model_path_txt, + **cv_kwargs + ) + cv_bst_w_file_init_model = cv_res["cvbooster"] + assert cv_bst_w_file_init_model.current_iteration() == [num_train_rounds + num_cv_rounds] * 3 + for booster in cv_bst_w_file_init_model.boosters: + np.testing.assert_allclose( + preds_raw, + booster.predict(X, raw_score=True, num_iteration=num_train_rounds) + ) + + # predictions should be identical + for i in range(3): + np.testing.assert_allclose( + cv_bst_w_in_mem_init_model.boosters[i].predict(X), + cv_bst_w_file_init_model.boosters[i].predict(X) + ) + + def test_cvbooster(): X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -1710,7 +1771,7 @@ def parse_tree_features(gbm): for tree in tree_str: # split_features are in 4th line. 
features = tree.splitlines()[3].split("=")[1].split(" ") - features = set(f"Column_{f}" for f in features) + features = {f"Column_{f}" for f in features} feature_sets.append(features) return np.array(feature_sets) @@ -2799,14 +2860,14 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration, iter_valid1_l2 = 3 iter_valid2_l1 = 3 iter_valid2_l2 = 15 - assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 + assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2 iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2]) iter_cv_l1 = 15 iter_cv_l2 = 13 - assert len(set([iter_cv_l1, iter_cv_l2])) == 2 + assert len({iter_cv_l1, iter_cv_l2}) == 2 iter_cv_min = min([iter_cv_l1, iter_cv_l2]) # test for lgb.train @@ -4015,3 +4076,59 @@ def test_validate_features(): # check that disabling the check doesn't raise the error bst.refit(df2, y, validate_features=False) + + +def test_train_and_cv_raise_informative_error_for_train_set_of_wrong_type(): + with pytest.raises(TypeError, match=r"train\(\) only accepts Dataset object, train_set has type 'list'\."): + lgb.train({}, train_set=[]) + with pytest.raises(TypeError, match=r"cv\(\) only accepts Dataset object, train_set has type 'list'\."): + lgb.cv({}, train_set=[]) + + +@pytest.mark.parametrize('num_boost_round', [-7, -1, 0]) +def test_train_and_cv_raise_informative_error_for_impossible_num_boost_round(num_boost_round): + X, y = make_synthetic_regression(n_samples=100) + error_msg = rf"num_boost_round must be greater than 0\. Got {num_boost_round}\." 
+ with pytest.raises(ValueError, match=error_msg): + lgb.train({}, train_set=lgb.Dataset(X, y), num_boost_round=num_boost_round) + with pytest.raises(ValueError, match=error_msg): + lgb.cv({}, train_set=lgb.Dataset(X, y), num_boost_round=num_boost_round) + + +def test_train_raises_informative_error_if_any_valid_sets_are_not_dataset_objects(): + X, y = make_synthetic_regression(n_samples=100) + X_valid = X * 2.0 + with pytest.raises(TypeError, match=r"Every item in valid_sets must be a Dataset object\. Item 1 has type 'tuple'\."): + lgb.train( + params={}, + train_set=lgb.Dataset(X, y), + valid_sets=[ + lgb.Dataset(X_valid, y), + ([1.0], [2.0]), + [5.6, 5.7, 5.8] + ] + ) + + +def test_train_raises_informative_error_for_params_of_wrong_type(): + X, y = make_synthetic_regression() + params = {"early_stopping_round": "too-many"} + dtrain = lgb.Dataset(X, label=y) + with pytest.raises(lgb.basic.LightGBMError, match="Parameter early_stopping_round should be of type int, got \"too-many\""): + lgb.train(params, dtrain) + + +def test_quantized_training(): + X, y = make_synthetic_regression() + ds = lgb.Dataset(X, label=y) + bst_params = {'num_leaves': 15, 'verbose': -1, 'seed': 0} + bst = lgb.train(bst_params, ds, num_boost_round=10) + rmse = np.sqrt(np.mean((bst.predict(X) - y) ** 2)) + bst_params.update({ + 'use_quantized_grad': True, + 'num_grad_quant_bins': 30, + 'quant_train_renew_leaf': True, + }) + quant_bst = lgb.train(bst_params, ds, num_boost_round=10) + quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2)) + assert quant_rmse < rmse + 6.0 diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 746c958a7304..2f1372545067 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -9,17 +9,19 @@ import joblib import numpy as np import pytest +import scipy.sparse +from scipy.stats import spearmanr from sklearn.base import clone from sklearn.datasets import 
load_svmlight_file, make_blobs, make_multilabel_classification from sklearn.ensemble import StackingClassifier, StackingRegressor -from sklearn.metrics import log_loss, mean_squared_error +from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain from sklearn.utils.estimator_checks import parametrize_with_checks from sklearn.utils.validation import check_is_fitted import lightgbm as lgb -from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame +from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression, sklearn_multiclass_custom_objective, softmax) @@ -27,20 +29,27 @@ decreasing_generator = itertools.count(0, -1) task_to_model_factory = { 'ranking': lgb.LGBMRanker, - 'classification': lgb.LGBMClassifier, + 'binary-classification': lgb.LGBMClassifier, + 'multiclass-classification': lgb.LGBMClassifier, 'regression': lgb.LGBMRegressor, } -def _create_data(task): +def _create_data(task, n_samples=100, n_features=4): if task == 'ranking': - X, y, g = make_ranking(n_features=4) + X, y, g = make_ranking(n_features=4, n_samples=n_samples) g = np.bincount(g) - elif task == 'classification': - X, y = load_iris(return_X_y=True) + elif task.endswith('classification'): + if task == 'binary-classification': + centers = 2 + elif task == 'multiclass-classification': + centers = 3 + else: + ValueError(f"Unknown classification task '{task}'") + X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42) g = None elif task == 'regression': - X, y = make_synthetic_regression() + X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features) g = None 
return X, y, g @@ -248,6 +257,212 @@ def test_binary_classification_with_custom_objective(): assert ret < 0.05 +def test_early_stopping_validation_set_split_strategy_param_check(): + + X, y = load_breast_cancer(return_X_y=True) + gbm = lgb.LGBMClassifier( + n_estimators=50, + random_state=42, + verbose=-1, + early_stopping=True, + validation_set_split_strategy="invalid_strategy" + ) + with pytest.raises( + ValueError, + match=r"validation_set_split_strategy must be a callable or one of the following*" + ): + gbm.fit(X, y) + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_binary_classification_with_auto_early_stopping(use_weight): + + X, y = load_breast_cancer(return_X_y=True) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMClassifier( + n_estimators=n_estimators, random_state=42, verbose=-1, early_stopping=True + ) + weight = np.full_like(y_train, 2) if use_weight else None + gbm.fit(X_train, y_train, sample_weight=weight) + ret = log_loss(y_test, gbm.predict_proba(X_test)) + assert gbm._Booster.num_trees() < n_estimators + assert ret < 0.21 + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_binary_classification_with_auto_early_stopping_use_train_as_val_set(use_weight, recwarn): + + X, y = load_breast_cancer(return_X_y=True) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMClassifier( + n_estimators=n_estimators, + random_state=42, + verbose=-1, + early_stopping=True, + validation_fraction=None # Use train as validation set + ) + weight = np.full_like(y_train, 2) if use_weight else None + gbm.fit(X_train, y_train, sample_weight=weight) + # Check that the warning UserWarning("Only training set found, disabling early stopping.") + # is not raised + assert len(recwarn) == 0 + ret = log_loss(y_test, gbm.predict_proba(X_test)) + assert gbm._Booster.num_trees() < 
n_estimators + assert ret < 0.24 + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_binary_classification_with_auto_early_stopping_random(use_weight): + + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMClassifier( + n_estimators=n_estimators, + random_state=42, + verbose=-1, + early_stopping=True, + validation_set_split_strategy="random" + ) + weight = np.full_like(y_train, 2) if use_weight else None + gbm.fit(X_train, y_train, sample_weight=weight) + ret = log_loss(y_test, gbm.predict_proba(X_test)) + assert gbm._Booster.num_trees() < n_estimators + assert ret < 0.18 + + +def test_binary_classification_with_custom_eval_set_splitter(): + def custom_val_splitter(X, y): + return train_test_split(X, y, test_size=0.1, random_state=42, stratify=y) + X, y = load_breast_cancer(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMClassifier( + n_estimators=n_estimators, + random_state=42, + early_stopping=True, + verbose=-1, + validation_set_split_strategy=custom_val_splitter + ) + gbm.fit(X_train, y_train, callbacks=[lgb.early_stopping(5)]) + ret = log_loss(y_test, gbm.predict_proba(X_test)) + assert gbm._Booster.num_trees() < n_estimators + assert ret < 0.18 + + +@pytest.mark.parametrize('use_weight', [True, False]) +def test_regression_with_auto_early_stopping(use_weight): + X, y = make_synthetic_regression() + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMRegressor( + n_estimators=n_estimators, + random_state=42, + early_stopping=True, + verbose=-1, + ) + weight = np.full_like(y_train, 2) if use_weight else None + gbm.fit(X_train, y_train, sample_weight=weight) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert gbm._Booster.num_trees() < 
n_estimators + assert ret < 400 + + +def test_regression_with_custom_eval_set_splitter(): + def custom_val_splitter(X, y): + return train_test_split(X, y, test_size=0.1, random_state=42) + X, y = make_synthetic_regression() + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + n_estimators = 1000 + gbm = lgb.LGBMRegressor( + n_estimators=n_estimators, + random_state=42, + early_stopping=True, + verbose=-1, + validation_set_split_strategy=custom_val_splitter + ) + gbm.fit(X_train, y_train, callbacks=[lgb.early_stopping(5)]) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert gbm._Booster.num_trees() < n_estimators + assert ret < 400 + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') +@pytest.mark.parametrize('use_weight', [True, False]) +def test_lambdarank_with_auto_early_stopping(use_weight): + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) + q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) + gbm = lgb.LGBMRanker( + n_estimators=50, random_state=42, early_stopping=True + ) + weight = np.full_like(y_train, 2) if use_weight else None + gbm.fit( + X_train, + y_train, + sample_weight=weight, + group=q_train, + eval_at=[1, 3], + callbacks=[ + lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x)) + ] + ) + assert gbm.best_iteration_ <= 24 + assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674 + assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578 + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') +@pytest.mark.parametrize('validation_set_split_strategy', ["random", "stratify"]) +def test_lambdarank_with_auto_early_stopping_raise_exception(validation_set_split_strategy): + rank_example_dir = 
Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) + q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) + gbm = lgb.LGBMRanker( + n_estimators=50, + random_state=42, + early_stopping=True, + validation_set_split_strategy=validation_set_split_strategy + ) + with pytest.raises(ValueError, match=r"Parameter group has been specified but the selected*"): + gbm.fit( + X_train, + y_train, + group=q_train, + ) + + +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') +def test_lambdarank_with_custom_eval_set_splitter(): + def custom_val_splitter(X, y, weight, group): + X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test')) + q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query')) + return X, X_test, y, y_test, weight, None, group, q_test + rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' + X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) + q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) + gbm = lgb.LGBMRanker( + n_estimators=50, random_state=42, early_stopping=True, validation_set_split_strategy=custom_val_splitter + ) + gbm.fit( + X_train, + y_train, + group=q_train, + eval_at=[1, 3], + callbacks=[ + lgb.early_stopping(10), + lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x)) + ] + ) + assert gbm.best_iteration_ <= 24 + assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674 + assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578 + + def test_dart(): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -304,20 +519,24 @@ def test_grid_search(): y = y.astype(str) # utilize label encoder at it's max power X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, 
X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42) - params = dict(subsample=0.8, - subsample_freq=1) - grid_params = dict(boosting_type=['rf', 'gbdt'], - n_estimators=[4, 6], - reg_alpha=[0.01, 0.005]) + params = { + "subsample": 0.8, + "subsample_freq": 1 + } + grid_params = { + "boosting_type": ['rf', 'gbdt'], + "n_estimators": [4, 6], + "reg_alpha": [0.01, 0.005] + } evals_result = {} - fit_params = dict( - eval_set=[(X_val, y_val)], - eval_metric=constant_metric, - callbacks=[ + fit_params = { + "eval_set": [(X_val, y_val)], + "eval_metric": constant_metric, + "callbacks": [ lgb.early_stopping(2), lgb.record_evaluation(evals_result) ] - ) + } grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2) grid.fit(X_train, y_train, **fit_params) score = grid.score(X_test, y_test) # utilizes GridSearchCV default refit=True @@ -341,14 +560,20 @@ def test_random_search(): X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42) n_iter = 3 # Number of samples - params = dict(subsample=0.8, - subsample_freq=1) - param_dist = dict(boosting_type=['rf', 'gbdt'], - n_estimators=[np.random.randint(low=3, high=10) for i in range(n_iter)], - reg_alpha=[np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]) - fit_params = dict(eval_set=[(X_val, y_val)], - eval_metric=constant_metric, - callbacks=[lgb.early_stopping(2)]) + params = { + "subsample": 0.8, + "subsample_freq": 1 + } + param_dist = { + "boosting_type": ['rf', 'gbdt'], + "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)], + "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)] + } + fit_params = { + "eval_set": [(X_val, y_val)], + "eval_metric": constant_metric, + "callbacks": [lgb.early_stopping(2)] + } rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params), param_distributions=param_dist, cv=2, n_iter=n_iter, random_state=42) @@ -1130,7 
+1355,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_ iter_valid1_l2 = 4 iter_valid2_l1 = 2 iter_valid2_l2 = 2 - assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2 + assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2 iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1]) iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2]) iter_min = min([iter_min_l1, iter_min_l2]) @@ -1268,7 +1493,7 @@ def test_sklearn_integration(estimator, check): check(estimator) -@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression']) def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task): pd = pytest.importorskip("pandas") X, y, g = _create_data(task) @@ -1378,9 +1603,9 @@ def test_default_n_jobs(tmp_path): @pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed') -@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression']) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression']) def test_validate_features(task): - X, y, g = _create_data(task) + X, y, g = _create_data(task, n_features=4) features = ['x1', 'x2', 'x3', 'x4'] df = pd_DataFrame(X, columns=features) model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1) @@ -1397,3 +1622,148 @@ def test_validate_features(task): # check that disabling the check doesn't raise the error model.predict(df2, validate_features=False) + + +@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame']) +@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_Series', 'pd_DataFrame']) +@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'regression']) +def 
test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task): + if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED: + pytest.skip('pandas is not installed') + if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED: + pytest.skip('datatable is not installed') + X, y, g = _create_data(task, n_samples=2_000) + weights = np.abs(np.random.randn(y.shape[0])) + + if task == 'binary-classification' or task == 'regression': + init_score = np.full_like(y, np.mean(y)) + elif task == 'multiclass-classification': + init_score = np.outer(y, np.array([0.1, 0.2, 0.7])) + else: + raise ValueError(f"Unrecognized task '{task}'") + + X_valid = X * 2 + if X_type == 'dt_DataTable': + X = dt_DataTable(X) + elif X_type == 'list2d': + X = X.tolist() + elif X_type == 'scipy_csc': + X = scipy.sparse.csc_matrix(X) + elif X_type == 'scipy_csr': + X = scipy.sparse.csr_matrix(X) + elif X_type == 'pd_DataFrame': + X = pd_DataFrame(X) + elif X_type != 'numpy': + raise ValueError(f"Unrecognized X_type: '{X_type}'") + + # make weights and init_score same types as y, just to avoid + # a huge number of combinations and therefore test cases + if y_type == 'list1d': + y = y.tolist() + weights = weights.tolist() + init_score = init_score.tolist() + elif y_type == 'pd_DataFrame': + y = pd_DataFrame(y) + weights = pd_Series(weights) + if task == 'multiclass-classification': + init_score = pd_DataFrame(init_score) + else: + init_score = pd_Series(init_score) + elif y_type == 'pd_Series': + y = pd_Series(y) + weights = pd_Series(weights) + if task == 'multiclass-classification': + init_score = pd_DataFrame(init_score) + else: + init_score = pd_Series(init_score) + elif y_type != 'numpy': + raise ValueError(f"Unrecognized y_type: '{y_type}'") + + model = task_to_model_factory[task](n_estimators=10, verbose=-1) + model.fit( + X=X, + y=y, + sample_weight=weights, + init_score=init_score, + eval_set=[(X_valid, y)], 
+ eval_sample_weight=[weights], + eval_init_score=[init_score] + ) + + preds = model.predict(X) + if task == 'binary-classification': + assert accuracy_score(y, preds) >= 0.99 + elif task == 'multiclass-classification': + assert accuracy_score(y, preds) >= 0.99 + elif task == 'regression': + assert r2_score(y, preds) > 0.86 + else: + raise ValueError(f"Unrecognized task: '{task}'") + + +@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame']) +@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_DataFrame', 'pd_Series']) +@pytest.mark.parametrize('g_type', ['list1d_float', 'list1d_int', 'numpy', 'pd_Series']) +def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type): + if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED: + pytest.skip('pandas is not installed') + if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED: + pytest.skip('datatable is not installed') + X, y, g = _create_data(task='ranking', n_samples=1_000) + weights = np.abs(np.random.randn(y.shape[0])) + init_score = np.full_like(y, np.mean(y)) + X_valid = X * 2 + + if X_type == 'dt_DataTable': + X = dt_DataTable(X) + elif X_type == 'list2d': + X = X.tolist() + elif X_type == 'scipy_csc': + X = scipy.sparse.csc_matrix(X) + elif X_type == 'scipy_csr': + X = scipy.sparse.csr_matrix(X) + elif X_type == 'pd_DataFrame': + X = pd_DataFrame(X) + elif X_type != 'numpy': + raise ValueError(f"Unrecognized X_type: '{X_type}'") + + # make weights and init_score same types as y, just to avoid + # a huge number of combinations and therefore test cases + if y_type == 'list1d': + y = y.tolist() + weights = weights.tolist() + init_score = init_score.tolist() + elif y_type == 'pd_DataFrame': + y = pd_DataFrame(y) + weights = pd_Series(weights) + init_score = pd_Series(init_score) + elif y_type == 'pd_Series': + y = pd_Series(y) + weights = 
pd_Series(weights) + init_score = pd_Series(init_score) + elif y_type != 'numpy': + raise ValueError(f"Unrecognized y_type: '{y_type}'") + + if g_type == 'list1d_float': + g = g.astype("float").tolist() + elif g_type == 'list1d_int': + g = g.astype("int").tolist() + elif g_type == 'pd_Series': + g = pd_Series(g) + elif g_type != 'numpy': + raise ValueError(f"Unrecognized g_type: '{g_type}'") + + model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1) + model.fit( + X=X, + y=y, + sample_weight=weights, + init_score=init_score, + group=g, + eval_set=[(X_valid, y)], + eval_sample_weight=[weights], + eval_init_score=[init_score], + eval_group=[g] + ) + preds = model.predict(X) + assert spearmanr(preds, y).correlation >= 0.99 diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 342616d27daa..96fe017e96b8 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -34,7 +34,7 @@ SAK SAK LightGBM - 8.1 + 10.0 @@ -101,7 +101,7 @@ - EIGEN_MPL2_ONLY;EIGEN_DONT_PARALLELIZE + EIGEN_MPL2_ONLY;EIGEN_DONT_PARALLELIZE;WIN_HAS_INET_PTON; @@ -306,6 +306,7 @@ + @@ -341,6 +342,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index ed591fc4d87a..27b445893c0f 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -51,6 +51,9 @@ src\treelearner + + src\treelearner + src\application @@ -338,5 +341,8 @@ src\treelearner + + src\treelearner + \ No newline at end of file