diff --git a/.appveyor.yml b/.appveyor.yml
index 274064fc56cd..a5cd02d69e23 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -23,7 +23,6 @@ clone_depth: 5
 
 install:
   - git submodule update --init --recursive  # get `external_libs` folder
-  - set PATH=%PATH:C:\Program Files\Git\usr\bin;=%  # delete sh.exe from PATH (mingw32-make fix)
   - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH%
   - set PYTHON_VERSION=%CONFIGURATION%
   - set CONDA_ENV="test-env"
diff --git a/.ci/check_python_dists.sh b/.ci/check_python_dists.sh
index e7e4a86b47e4..cb0bbae79fa9 100644
--- a/.ci/check_python_dists.sh
+++ b/.ci/check_python_dists.sh
@@ -17,4 +17,35 @@ if { test "${TASK}" = "bdist" || test "${METHOD}" = "wheel"; }; then
     check-wheel-contents ${DIST_DIR}/*.whl || exit -1
 fi
 
+PY_MINOR_VER=$(python -c "import sys; print(sys.version_info.minor)")
+if [ "${PY_MINOR_VER}" -gt 7 ]; then  # i.e. Python 3.8 or newer
+    echo "pydistcheck..."
+    pip install pydistcheck
+    if { test "${TASK}" = "cuda" || test "${METHOD}" = "wheel"; }; then
+        pydistcheck \
+            --inspect \
+            --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
+            --max-allowed-size-uncompressed '60M' \
+            --max-allowed-files 800 \
+            "${DIST_DIR}"/* || exit -1
+    elif { test "$(uname -m)" = "aarch64"; }; then
+        pydistcheck \
+            --inspect \
+            --ignore 'compiled-objects-have-debug-symbols' \
+            --max-allowed-size-compressed '5M' \
+            --max-allowed-size-uncompressed '15M' \
+            --max-allowed-files 800 \
+            "${DIST_DIR}"/* || exit -1
+    else
+        pydistcheck \
+            --inspect \
+            --max-allowed-size-compressed '5M' \
+            --max-allowed-size-uncompressed '15M' \
+            --max-allowed-files 800 \
+            "${DIST_DIR}"/* || exit -1
+    fi
+else
+    echo "skipping pydistcheck (does not support Python 3.${PY_MINOR_VER})"
+fi
+
 echo "done checking Python package distributions"
diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh
new file mode 100755
index 000000000000..ef9fff683731
--- /dev/null
+++ b/.ci/lint-cpp.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+# Lint C++ sources with cpplint and CMake files with cmakelint.
+# All paths are relative, so run this from the root of the repository.
+
+echo "running cpplint"
+cpplint \
+    --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length \
+    --recursive ./src ./include ./R-package ./swig ./tests \
+|| exit 1
+echo "done running cpplint"
+
+echo "running cmakelint"
+cmake_files=$(
+    find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \
+    | grep -v external_libs
+)
+# left unquoted: word splitting turns each found path into its own argument
+cmakelint \
+    --linelength=120 \
+    --filter=-convention/filename,-package/stdargs,-readability/wonkycase \
+    ${cmake_files} \
+|| exit 1
+echo "done running cmakelint"
diff --git a/.ci/lint-python.sh b/.ci/lint-python.sh
new file mode 100755
index 000000000000..887bc9fdebf1
--- /dev/null
+++ b/.ci/lint-python.sh
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+# Lint Python code with ruff and isort, then type-check it with mypy.
+# All paths are relative, so run this from the root of the repository.
+
+echo "running ruff"
+ruff check \
+    --config=./python-package/pyproject.toml \
+    . \
+|| exit 1
+echo "done running ruff"
+
+echo "running isort"
+isort \
+    --check-only \
+    --settings-path=./python-package/pyproject.toml \
+    . \
+|| exit 1
+echo "done running isort"
+
+# mypy findings are printed but never fail the script (note the '|| true')
+echo "running mypy"
+mypy \
+    --config-file=./python-package/pyproject.toml \
+    ./python-package \
+|| true
+echo "done running mypy"
diff --git a/.ci/test.sh b/.ci/test.sh
index 4b01e7c241af..80ed7d2d0ce3 100755
--- a/.ci/test.sh
+++ b/.ci/test.sh
@@ -66,30 +66,22 @@ if [[ $TASK == "swig" ]]; then
 fi
 
 if [[ $TASK == "lint" ]]; then
+    cd ${BUILD_DIRECTORY}
     conda create -q -y -n $CONDA_ENV \
         ${CONDA_PYTHON_REQUIREMENT} \
         cmakelint \
         cpplint \
-        flake8 \
         isort \
         mypy \
-        pydocstyle \
-        "r-lintr>=3.0"
+        'r-lintr>=3.0' \
+        ruff
     source activate $CONDA_ENV
     echo "Linting Python code"
-    flake8 \
-        --ignore=E501,W503 \
-        --exclude=./.nuget,./external_libs,./python-package/build \
-        . || exit -1
-    pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
-    isort . --check-only || exit -1
-    mypy --ignore-missing-imports python-package/ || true
+    sh ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit -1
     echo "Linting R code"
     Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
     echo "Linting C++ code"
-    cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1
-    cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake")
-    cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1
+    sh ${BUILD_DIRECTORY}/.ci/lint-cpp.sh || exit -1
     exit 0
 fi
 
@@ -153,21 +145,23 @@ if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "clang" ]]; then
 fi
 
 if [[ $TASK == "sdist" ]]; then
-    cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
-    sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-    pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v || exit -1
+    cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1
+    sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+    pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz -v || exit -1
     if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
-        cp $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY
+        cp $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1
     fi
     pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
     exit 0
 elif [[ $TASK == "bdist" ]]; then
     if [[ $OS_NAME == "macos" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --plat-name=macosx --python-tag py3 || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        mv dist/lightgbm-$LGB_VER-py3-none-macosx.whl dist/lightgbm-$LGB_VER-py3-none-macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl
+        cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        mv \
+            ./dist/*.whl \
+            dist/lightgbm-$LGB_VER-py3-none-macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl || exit -1
         if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
-            cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
+            cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1
         fi
     else
         ARCH=$(uname -m)
@@ -176,37 +170,51 @@ elif [[ $TASK == "bdist" ]]; then
         else
             PLATFORM="manylinux2014_$ARCH"
         fi
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --integrated-opencl --plat-name=$PLATFORM --python-tag py3 || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --integrated-opencl || exit -1
+        mv \
+            ./dist/*.whl \
+            ./dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
         if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
-            cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY
+            cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit -1
         fi
         # Make sure we can do both CPU and GPU; see tests/python_package_test/test_dual.py
         export LIGHTGBM_TEST_DUAL_CPU_GPU=1
     fi
-    pip install --user $BUILD_DIRECTORY/python-package/dist/*.whl || exit -1
+    pip install --user $BUILD_DIRECTORY/dist/*.whl || exit -1
     pytest $BUILD_DIRECTORY/tests || exit -1
     exit 0
 fi
 
-mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build
+# temporarily pin pip to versions that support 'pip install --install-option'
+# ref: https://github.com/microsoft/LightGBM/issues/5061#issuecomment-1510642287
+if [[ $METHOD == "pip" ]]; then
+    pip install 'pip<23.1'
+fi
 
 if [[ $TASK == "gpu" ]]; then
     sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' $BUILD_DIRECTORY/include/LightGBM/config.h
     grep -q 'std::string device_type = "gpu"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
     if [[ $METHOD == "pip" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--gpu || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install \
+            --user \
+            -v \
+            --install-option=--gpu \
+            $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
+        || exit -1
         pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
         exit 0
     elif [[ $METHOD == "wheel" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --gpu || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --gpu || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1
         pytest $BUILD_DIRECTORY/tests || exit -1
         exit 0
     elif [[ $METHOD == "source" ]]; then
+        mkdir $BUILD_DIRECTORY/build
+        cd $BUILD_DIRECTORY/build
         cmake -DUSE_GPU=ON ..
     fi
 elif [[ $TASK == "cuda" ]]; then
@@ -216,43 +224,59 @@ elif [[ $TASK == "cuda" ]]; then
     sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h
     grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1  # make sure that changes were really done
     if [[ $METHOD == "pip" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install \
+            --user \
+            -v \
+            --install-option=--cuda \
+            $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
+        || exit -1
         pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
         exit 0
     elif [[ $METHOD == "wheel" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --cuda || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1
         pytest $BUILD_DIRECTORY/tests || exit -1
         exit 0
     elif [[ $METHOD == "source" ]]; then
+        mkdir $BUILD_DIRECTORY/build
+        cd $BUILD_DIRECTORY/build
         cmake -DUSE_CUDA=ON ..
     fi
 elif [[ $TASK == "mpi" ]]; then
     if [[ $METHOD == "pip" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--mpi || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install \
+            --user \
+            -v \
+            --install-option=--mpi \
+            $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
+        || exit -1
         pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1
         exit 0
     elif [[ $METHOD == "wheel" ]]; then
-        cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --mpi || exit -1
-        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/python-package/dist || exit -1
-        pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1
+        cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --mpi || exit -1
+        sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit -1
+        pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit -1
         pytest $BUILD_DIRECTORY/tests || exit -1
         exit 0
     elif [[ $METHOD == "source" ]]; then
+        mkdir $BUILD_DIRECTORY/build
+        cd $BUILD_DIRECTORY/build
         cmake -DUSE_MPI=ON -DUSE_DEBUG=ON ..
     fi
 else
+    mkdir $BUILD_DIRECTORY/build
+    cd $BUILD_DIRECTORY/build
     cmake ..
 fi
 
 make _lightgbm -j4 || exit -1
 
-cd $BUILD_DIRECTORY/python-package && python setup.py install --precompile --user || exit -1
+cd $BUILD_DIRECTORY && sh ./build-python.sh install --precompile --user || exit -1
 pytest $BUILD_DIRECTORY/tests || exit -1
 
 if [[ $TASK == "regular" ]]; then
diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
index dbc76f061114..34322ecef7d1 100755
--- a/.ci/test_r_package.sh
+++ b/.ci/test_r_package.sh
@@ -17,7 +17,7 @@ fi
 R_MAJOR_VERSION=( ${R_VERSION//./ } )
 if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
     export R_MAC_VERSION=3.6.3
-    export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/R-${R_MAC_VERSION}.pkg
+    export R_MAC_PKG_URL=${CRAN_MIRROR}/bin/macosx/R-${R_MAC_VERSION}.nn.pkg
     export R_LINUX_VERSION="3.6.3-1bionic"
     export R_APT_REPO="bionic-cran35/"
 elif [[ "${R_MAJOR_VERSION}" == "4" ]]; then
@@ -77,13 +77,14 @@ fi
 
 # Installing R precompiled for Mac OS 10.11 or higher
 if [[ $OS_NAME == "macos" ]]; then
+    brew update-reset && brew update
     if [[ $R_BUILD_TYPE == "cran" ]]; then
         brew install automake || exit -1
     fi
     brew install \
         checkbashisms \
         qpdf || exit -1
-    brew install --cask basictex || exit -1
+    brew install basictex || exit -1
     export PATH="/Library/TeX/texbin:$PATH"
     sudo tlmgr --verify-repo=none update --self || exit -1
     sudo tlmgr --verify-repo=none install inconsolata helvetic rsfs || exit -1
@@ -117,6 +118,16 @@ if [[ $OS_NAME == "macos" ]]; then
     fi
 fi
 
+# fix for issue where CRAN was not returning {lattice} when using R 3.6
+# "Warning: dependency ‘lattice’ is not available"
+#
+# refs for that MRAN snapshot:
+# * https://cran.r-project.org/web/packages/checkpoint/readme/README.html
+# * https://help.codeocean.com/en/articles/3087704-using-mran-snapshots-to-install-archived-r-packages
+if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
+    Rscript --vanilla -e "install.packages('lattice', repos = 'https://cran.microsoft.com/snapshot/2020-04-23/', lib = '${R_LIB_PATH}')"
+fi
+
 # Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat'
 # to avoid a CI-time dependency on devtools (for devtools::install_deps())
 # NOTE: testthat is not required when running rchk
diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1
index 4735de82902c..1e14cec3fc28 100644
--- a/.ci/test_windows.ps1
+++ b/.ci/test_windows.ps1
@@ -35,7 +35,7 @@ if ($env:TASK -eq "swig") {
   mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build
   cmake -A x64 -DUSE_SWIG=ON .. ; cmake --build . --target ALL_BUILD --config Release ; Check-Output $?
   if ($env:AZURE -eq "true") {
-    cp $env:BUILD_SOURCESDIRECTORY/build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar
+    cp $env:BUILD_SOURCESDIRECTORY/build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar ; Check-Output $?
   }
   Exit 0
 }
@@ -44,6 +44,12 @@ if ($env:TASK -eq "swig") {
 conda init powershell
 conda activate
 conda config --set always_yes yes --set changeps1 no
+
+# ref:
+# * https://stackoverflow.com/a/62897729/3986677
+# * https://github.com/microsoft/LightGBM/issues/5899
+conda install brotlipy
+
 conda update -q -y conda
 conda create -q -y -n $env:CONDA_ENV `
   cloudpickle `
@@ -65,15 +71,15 @@ if ($env:TASK -ne "bdist") {
 if ($env:TASK -eq "regular") {
   mkdir $env:BUILD_SOURCESDIRECTORY/build; cd $env:BUILD_SOURCESDIRECTORY/build
   cmake -A x64 .. ; cmake --build . --target ALL_BUILD --config Release ; Check-Output $?
-  cd $env:BUILD_SOURCESDIRECTORY/python-package
-  python setup.py install --precompile ; Check-Output $?
+  cd $env:BUILD_SOURCESDIRECTORY
+  sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --precompile ; Check-Output $?
   cp $env:BUILD_SOURCESDIRECTORY/Release/lib_lightgbm.dll $env:BUILD_ARTIFACTSTAGINGDIRECTORY
   cp $env:BUILD_SOURCESDIRECTORY/Release/lightgbm.exe $env:BUILD_ARTIFACTSTAGINGDIRECTORY
 }
 elseif ($env:TASK -eq "sdist") {
-  cd $env:BUILD_SOURCESDIRECTORY/python-package
-  python setup.py sdist --formats gztar ; Check-Output $?
-  sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/python-package/dist ; Check-Output $?
+  cd $env:BUILD_SOURCESDIRECTORY
+  sh $env:BUILD_SOURCESDIRECTORY/build-python.sh sdist ; Check-Output $?
+  sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $?
   cd dist; pip install @(Get-ChildItem *.gz) -v ; Check-Output $?
 }
 elseif ($env:TASK -eq "bdist") {
@@ -87,17 +93,17 @@ elseif ($env:TASK -eq "bdist") {
   Get-ItemProperty -Path Registry::HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors
 
   conda activate $env:CONDA_ENV
-  cd $env:BUILD_SOURCESDIRECTORY/python-package
-  python setup.py bdist_wheel --integrated-opencl --plat-name=win-amd64 --python-tag py3 ; Check-Output $?
-  sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/python-package/dist ; Check-Output $?
+  cd $env:BUILD_SOURCESDIRECTORY
+  sh "build-python.sh" bdist_wheel --integrated-opencl ; Check-Output $?
+  sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $?
   cd dist; pip install --user @(Get-ChildItem *.whl) ; Check-Output $?
   cp @(Get-ChildItem *.whl) $env:BUILD_ARTIFACTSTAGINGDIRECTORY
 } elseif (($env:APPVEYOR -eq "true") -and ($env:TASK -eq "python")) {
-  cd $env:BUILD_SOURCESDIRECTORY\python-package
+  cd $env:BUILD_SOURCESDIRECTORY
   if ($env:COMPILER -eq "MINGW") {
-    python setup.py install --mingw ; Check-Output $?
+    sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --mingw ; Check-Output $?
   } else {
-    python setup.py install ; Check-Output $?
+    sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install ; Check-Output $?
   }
 }
 
diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml
index 62ebc86726a6..eb2cb90a424e 100644
--- a/.github/workflows/r_package.yml
+++ b/.github/workflows/r_package.yml
@@ -63,24 +63,12 @@ jobs:
             r_version: 4.2
             build_type: cmake
             container: 'ubuntu:22.04'
-          - os: macOS-latest
-            task: r-package
-            compiler: gcc
-            r_version: 3.6
-            build_type: cmake
-            container: null
           - os: macOS-latest
             task: r-package
             compiler: gcc
             r_version: 4.2
             build_type: cmake
             container: null
-          - os: macOS-latest
-            task: r-package
-            compiler: clang
-            r_version: 3.6
-            build_type: cmake
-            container: null
           - os: macOS-latest
             task: r-package
             compiler: clang
diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml
index 415cbb66086a..bf369e79c0c5 100644
--- a/.github/workflows/static_analysis.yml
+++ b/.github/workflows/static_analysis.yml
@@ -21,7 +21,7 @@ env:
   CONDA_ENV: test-env
   GITHUB_ACTIONS: 'true'
   OS_NAME: 'linux'
-  PYTHON_VERSION: '3.10'
+  PYTHON_VERSION: '3.11'
 
 jobs:
   test:
diff --git a/.gitignore b/.gitignore
index bb65ca426bba..d4045d9a4798 100644
--- a/.gitignore
+++ b/.gitignore
@@ -399,6 +399,7 @@ lightgbm.model
 /cmake-build-debug/
 
 # Files from local Python install
+lightgbm-python/
 python-package/LICENSE
 python-package/build_cpp/
 python-package/compile/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 750b41ab8164..0792f0959ca6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,11 @@ set(
   "Semicolon separated list of sanitizer names, e.g., 'address;leak'. \
 Supported sanitizers are address, leak, undefined and thread."
 )
+option(BUILD_CLI "Build the 'lightgbm' command-line interface in addition to lib_lightgbm" ON)
 option(BUILD_CPP_TEST "Build C++ tests with Google Test" OFF)
 option(BUILD_STATIC_LIB "Build static library" OFF)
+option(INSTALL_HEADERS "Install headers to CMAKE_INSTALL_PREFIX (e.g. '/usr/local/include')" ON)
+option(__BUILD_FOR_PYTHON "Set to ON if building lib_lightgbm for use with the Python package" OFF)
 option(__BUILD_FOR_R "Set to ON if building lib_lightgbm for use with the R package" OFF)
 option(__INTEGRATE_OPENCL "Set to ON if building LightGBM with the OpenCL ICD Loader and its dependencies included" OFF)
 
@@ -55,6 +58,14 @@ if(__INTEGRATE_OPENCL)
   message(STATUS "Building library with integrated OpenCL components")
 endif()
 
+if(__BUILD_FOR_PYTHON OR __BUILD_FOR_R)
+    # the Python and R package don't require the CLI
+    set(BUILD_CLI OFF)
+    # installing the R and Python package shouldn't place LightGBM's headers
+    # outside of where the package is installed
+    set(INSTALL_HEADERS OFF)
+endif()
+
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.8.2")
     message(FATAL_ERROR "Insufficient gcc version")
@@ -421,8 +432,10 @@ endif()
 
 add_library(lightgbm_objs OBJECT ${SOURCES})
 
-add_executable(lightgbm src/main.cpp src/application/application.cpp)
-target_link_libraries(lightgbm PRIVATE lightgbm_objs)
+if(BUILD_CLI)
+    add_executable(lightgbm src/main.cpp src/application/application.cpp)
+    target_link_libraries(lightgbm PRIVATE lightgbm_objs)
+endif()
 
 set(API_SOURCES "src/c_api.cpp")
 # Only build the R part of the library if building for
@@ -544,19 +557,25 @@ if(USE_CUDA)
   # each target that contains or depends on cuda source.
   set_target_properties(lightgbm_objs PROPERTIES CUDA_ARCHITECTURES OFF)
   set_target_properties(_lightgbm PROPERTIES CUDA_ARCHITECTURES OFF)
-  set_target_properties(lightgbm PROPERTIES CUDA_ARCHITECTURES OFF)
+  if(BUILD_CLI)
+    set_target_properties(lightgbm PROPERTIES CUDA_ARCHITECTURES OFF)
+  endif()
 
   set_target_properties(lightgbm_objs PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 
   # Device linking is not supported for object libraries.
   # Thus we have to specify them on final targets.
-  set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+  if(BUILD_CLI)
+    set_target_properties(lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+  endif()
   set_target_properties(_lightgbm PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 
   # histograms are list of object libraries. Linking object library to other
   # object libraries only gets usage requirements, the linked objects won't be
   # used. Thus we have to call target_link_libraries on final targets here.
-  target_link_libraries(lightgbm PRIVATE ${histograms})
+  if(BUILD_CLI)
+    target_link_libraries(lightgbm PRIVATE ${histograms})
+  endif()
   target_link_libraries(_lightgbm PRIVATE ${histograms})
 endif()
 
@@ -566,7 +585,7 @@ endif()
 
 if(WIN32)
     if(MINGW OR CYGWIN)
-      target_link_libraries(lightgbm_objs PUBLIC Ws2_32 IPHLPAPI)
+      target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
     endif()
 endif()
 
@@ -619,11 +638,20 @@ if(BUILD_CPP_TEST)
   target_link_libraries(testlightgbm PRIVATE lightgbm_objs lightgbm_capi_objs GTest::GTest)
 endif()
 
+if(BUILD_CLI)
+    install(
+      TARGETS lightgbm
+      RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+    )
+endif()
+
 install(
-  TARGETS lightgbm _lightgbm
+  TARGETS _lightgbm
   RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
   LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
   ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
 )
 
-install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+if(INSTALL_HEADERS)
+    install(DIRECTORY ${LightGBM_HEADER_DIR}/LightGBM DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+endif()
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index 471bfc948cc3..ba9ef054bfab 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -48,6 +48,7 @@ OBJECTS = \
     treelearner/data_parallel_tree_learner.o \
     treelearner/feature_parallel_tree_learner.o \
     treelearner/gpu_tree_learner.o \
+    treelearner/gradient_discretizer.o \
     treelearner/linear_tree_learner.o \
     treelearner/serial_tree_learner.o \
     treelearner/tree_learner.o \
diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in
index 8d39317b4a3a..14f5afde002f 100644
--- a/R-package/src/Makevars.win.in
+++ b/R-package/src/Makevars.win.in
@@ -19,7 +19,7 @@ PKG_LIBS = \
     ${SHLIB_OPENMP_CXXFLAGS} \
     ${SHLIB_PTHREAD_FLAGS} \
     -lws2_32 \
-    -lIphlpapi
+    -liphlpapi
 
 OBJECTS = \
     boosting/boosting.o \
@@ -49,6 +49,7 @@ OBJECTS = \
     treelearner/data_parallel_tree_learner.o \
     treelearner/feature_parallel_tree_learner.o \
     treelearner/gpu_tree_learner.o \
+    treelearner/gradient_discretizer.o \
     treelearner/linear_tree_learner.o \
     treelearner/serial_tree_learner.o \
     treelearner/tree_learner.o \
diff --git a/build-python.sh b/build-python.sh
new file mode 100755
index 000000000000..1dd8bc9fe966
--- /dev/null
+++ b/build-python.sh
@@ -0,0 +1,342 @@
+#!/bin/sh
+
+# [description]
+#
+#     Prepare a source distribution (sdist) or built distribution (wheel)
+#     of the Python package, and optionally install it.
+#
+# [usage]
+#
+#     # build sdist and put it in dist/
+#     sh ./build-python.sh sdist
+#
+#     # build wheel and put it in dist/
+#     sh ./build-python.sh bdist_wheel [OPTIONS]
+#
+#     # compile lib_lightgbm and install the Python package wrapping it
+#     sh ./build-python.sh install [OPTIONS]
+#
+#     # install the Python package using a pre-compiled lib_lightgbm
+#     # (assumes lib_lightgbm.{dll,so} is located at the root of the repo)
+#     sh ./build-python.sh install --precompile
+#
+# [options]
+#
+#     --boost-dir=FILEPATH
+#                                   Directory with Boost package configuration file.
+#     --boost-include-dir=FILEPATH
+#                                   Directory containing Boost headers.
+#     --boost-librarydir=FILEPATH
+#                                   Preferred Boost library directory.
+#     --boost-root=FILEPATH
+#                                   Boost preferred installation prefix.
+#     --opencl-include-dir=FILEPATH
+#                                   OpenCL include directory.
+#     --opencl-library=FILEPATH
+#                                   Path to OpenCL library.
+#     --bit32
+#                                   Compile 32-bit version.
+#     --cuda
+#                                   Compile CUDA version.
+#     --gpu
+#                                   Compile GPU version.
+#     --hdfs
+#                                   Compile HDFS version.
+#     --integrated-opencl
+#                                   Compile integrated OpenCL version.
+#     --mingw
+#                                   Compile with MinGW.
+#     --mpi
+#                                   Compile MPI version.
+#     --nomp
+#                                   Compile version without OpenMP support.
+#     --precompile
+#                                   Use precompiled library.
+#                                   Only used with 'install' command.
+#     --time-costs
+#                                   Output time costs for different internal routines.
+#     --user
+#                                   Install into user-specific instead of global site-packages directory.
+#                                   Only used with 'install' command.
+
+set -e -u
+
+echo "building lightgbm"
+
+# Default values of arguments
+INSTALL="false"
+BUILD_SDIST="false"
+BUILD_WHEEL="false"
+
+PIP_INSTALL_ARGS=""
+BUILD_ARGS=""
+PRECOMPILE="false"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    ############################
+    # sub-commands of setup.py #
+    ############################
+    install)
+      INSTALL="true"
+      ;;
+    sdist)
+      BUILD_SDIST="true"
+      ;;
+    bdist_wheel)
+      BUILD_WHEEL="true"
+      ;;
+    ############################
+    # customized library paths #
+    ############################
+    --boost-dir|--boost-dir=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        BOOST_DIR="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --boost-dir='${BOOST_DIR}'"
+        ;;
+    --boost-include-dir|--boost-include-dir=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        BOOST_INCLUDE_DIR="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --boost-include-dir='${BOOST_INCLUDE_DIR}'"
+        ;;
+    --boost-librarydir|--boost-librarydir=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        BOOST_LIBRARY_DIR="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --boost-librarydir='${BOOST_LIBRARY_DIR}'"
+        ;;
+    --boost-root|--boost-root=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        BOOST_ROOT="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --boost-root='${BOOST_ROOT}'"
+        ;;
+    --opencl-include-dir|--opencl-include-dir=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        OPENCL_INCLUDE_DIR="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --opencl-include-dir='${OPENCL_INCLUDE_DIR}'"
+        ;;
+    --opencl-library|--opencl-library=*)
+        if [[ "$1" != *=* ]];
+            then shift;
+        fi
+        OPENCL_LIBRARY="${1#*=}"
+        BUILD_ARGS="${BUILD_ARGS} --opencl-library='${OPENCL_LIBRARY}'"
+        ;;
+    #########
+    # flags #
+    #########
+    --bit32)
+        BUILD_ARGS="${BUILD_ARGS} --bit32"
+        ;;
+    --cuda)
+        BUILD_ARGS="${BUILD_ARGS} --cuda"
+        ;;
+    --gpu)
+        BUILD_ARGS="${BUILD_ARGS} --gpu"
+        ;;
+    --hdfs)
+        BUILD_ARGS="${BUILD_ARGS} --hdfs"
+        ;;
+    --integrated-opencl)
+        BUILD_ARGS="${BUILD_ARGS} --integrated-opencl"
+        ;;
+    --mingw)
+        BUILD_ARGS="${BUILD_ARGS} --mingw"
+        ;;
+    --mpi)
+        BUILD_ARGS="${BUILD_ARGS} --mpi"
+        ;;
+    --nomp)
+        BUILD_ARGS="${BUILD_ARGS} --nomp"
+        ;;
+    --precompile)
+        PRECOMPILE="true"
+        ;;
+    --time-costs)
+        BUILD_ARGS="${BUILD_ARGS} --time-costs"
+        ;;
+    --user)
+        PIP_INSTALL_ARGS="${PIP_INSTALL_ARGS} --user"
+        ;;
+    *)
+        echo "invalid argument '${1}'"
+        exit -1
+        ;;
+  esac
+  shift
+done
+
+# create a new directory (./lightgbm-python) that just contains the files needed
+# to build the Python package; any output of a previous run is removed first
+create_isolated_source_dir() {
+    rm -rf \
+        ./lightgbm-python \
+        ./lightgbm \
+        ./python-package/build \
+        ./python-package/build_cpp \
+        ./python-package/compile \
+        ./python-package/dist \
+        ./python-package/lightgbm.egg-info
+
+    cp -R ./python-package ./lightgbm-python
+
+    # temporarily remove these files until
+    # https://github.com/microsoft/LightGBM/issues/5061 is done
+    rm ./lightgbm-python/pyproject.toml
+    rm ./lightgbm-python/setup.cfg
+
+    cp LICENSE ./lightgbm-python/
+    cp VERSION.txt ./lightgbm-python/lightgbm/VERSION.txt
+
+    mkdir -p ./lightgbm-python/compile
+    cp -R ./cmake ./lightgbm-python/compile
+    cp CMakeLists.txt ./lightgbm-python/compile
+    cp -R ./include ./lightgbm-python/compile
+    cp -R ./src ./lightgbm-python/compile
+    cp -R ./swig ./lightgbm-python/compile
+    cp -R ./windows ./lightgbm-python/compile
+
+    # include only specific files from external_libs, to keep the package
+    # small and avoid redistributing code with licenses incompatible with
+    # LightGBM's license
+
+    ######################
+    # fast_double_parser #
+    ######################
+    mkdir -p ./lightgbm-python/compile/external_libs/fast_double_parser
+    cp \
+        external_libs/fast_double_parser/CMakeLists.txt \
+        ./lightgbm-python/compile/external_libs/fast_double_parser/CMakeLists.txt
+    cp \
+        external_libs/fast_double_parser/LICENSE* \
+        ./lightgbm-python/compile/external_libs/fast_double_parser/
+
+    mkdir -p ./lightgbm-python/compile/external_libs/fast_double_parser/include/
+    cp \
+        external_libs/fast_double_parser/include/fast_double_parser.h \
+        ./lightgbm-python/compile/external_libs/fast_double_parser/include/
+
+    #######
+    # fmt #
+    #######
+    mkdir -p ./lightgbm-python/compile/external_libs/fmt
+    cp \
+        external_libs/fmt/CMakeLists.txt \
+        ./lightgbm-python/compile/external_libs/fmt/CMakeLists.txt
+    cp \
+        external_libs/fmt/LICENSE* \
+        ./lightgbm-python/compile/external_libs/fmt/
+
+    mkdir -p ./lightgbm-python/compile/external_libs/fmt/include/fmt
+    cp \
+        external_libs/fmt/include/fmt/*.h \
+        ./lightgbm-python/compile/external_libs/fmt/include/fmt/
+
+    #########
+    # Eigen #
+    #########
+    mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen
+    cp \
+        external_libs/eigen/CMakeLists.txt \
+        ./lightgbm-python/compile/external_libs/eigen/CMakeLists.txt
+
+    modules="Cholesky Core Dense Eigenvalues Geometry Householder Jacobi LU QR SVD"
+    for eigen_module in ${modules}; do
+        cp \
+            external_libs/eigen/Eigen/${eigen_module} \
+            ./lightgbm-python/compile/external_libs/eigen/Eigen/${eigen_module}
+        # for every module except "Dense", also copy its Eigen/src/<module>/ directory
+        if [ ${eigen_module} != "Dense" ]; then
+            mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/src/${eigen_module}/
+            cp -R \
+                external_libs/eigen/Eigen/src/${eigen_module}/* \
+                ./lightgbm-python/compile/external_libs/eigen/Eigen/src/${eigen_module}/
+        fi
+    done
+
+    # 'misc' is copied as a directory into Eigen/src/, so Eigen/src/ is what must exist
+    mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/src
+    cp -R \
+        external_libs/eigen/Eigen/src/misc \
+        ./lightgbm-python/compile/external_libs/eigen/Eigen/src/
+
+    # same layout for 'plugins': the directory is copied into Eigen/src/
+    mkdir -p ./lightgbm-python/compile/external_libs/eigen/Eigen/src
+    cp -R \
+        external_libs/eigen/Eigen/src/plugins \
+        ./lightgbm-python/compile/external_libs/eigen/Eigen/src/
+
+    ###################
+    # compute (Boost) #
+    ###################
+    mkdir -p ./lightgbm-python/compile/external_libs/compute
+    cp \
+        external_libs/compute/CMakeLists.txt \
+        ./lightgbm-python/compile/external_libs/compute/
+    cp \
+        -R \
+        external_libs/compute/cmake \
+        ./lightgbm-python/compile/external_libs/compute/cmake/
+    cp \
+        -R \
+        external_libs/compute/include \
+        ./lightgbm-python/compile/external_libs/compute/include/
+    cp \
+        -R \
+        external_libs/compute/meta \
+        ./lightgbm-python/compile/external_libs/compute/meta/
+}
+
+create_isolated_source_dir
+
+cd ./lightgbm-python
+
+# installation involves building the wheel + `pip install`-ing it
+if test "${INSTALL}" = true; then
+    if test "${PRECOMPILE}" = true; then
+        echo "--- installing lightgbm (from precompiled lib_lightgbm) ---"
+        python setup.py install ${PIP_INSTALL_ARGS} --precompile
+        exit 0
+    else
+        BUILD_SDIST="false"
+        BUILD_WHEEL="true"
+    fi
+fi
+
+if test "${BUILD_SDIST}" = true; then
+    echo "--- building sdist ---"
+    rm -f ../dist/*.tar.gz
+    python ./setup.py sdist \
+        --dist-dir ../dist
+fi
+
+if test "${BUILD_WHEEL}" = true; then
+    echo "--- building wheel ---"
+    rm -f ../dist/*.whl || true
+    python setup.py bdist_wheel \
+        --dist-dir ../dist \
+        ${BUILD_ARGS}
+fi
+
+if test "${INSTALL}" = true; then
+    echo "--- installing lightgbm ---"
+    # ref for use of '--find-links': https://stackoverflow.com/a/52481267/3986677
+    cd ../dist
+    pip install \
+        ${PIP_INSTALL_ARGS} \
+        --find-links=. \
+        lightgbm
+    cd ../
+fi
+
+echo "cleaning up"
+rm -rf ./lightgbm-python
diff --git a/docker/dockerfile-python b/docker/dockerfile-python
index 6c5ca6501ac3..541884811a0b 100644
--- a/docker/dockerfile-python
+++ b/docker/dockerfile-python
@@ -26,7 +26,7 @@ RUN apt-get update && \
     # lightgbm
     conda install -q -y numpy scipy scikit-learn pandas && \
     git clone --recursive --branch stable --depth 1 https://github.com/Microsoft/LightGBM && \
-    cd LightGBM/python-package && python setup.py install && \
+    cd ./LightGBM && sh ./build-python.sh install && \
     # clean
     apt-get autoremove -y && apt-get clean && \
     conda clean -a -y && \
diff --git a/docker/gpu/dockerfile.gpu b/docker/gpu/dockerfile.gpu
index bac9d97b2c2b..74c301234020 100644
--- a/docker/gpu/dockerfile.gpu
+++ b/docker/gpu/dockerfile.gpu
@@ -88,7 +88,7 @@ RUN cd /usr/local/src && mkdir lightgbm && cd lightgbm && \
 
 ENV PATH /usr/local/src/lightgbm/LightGBM:${PATH}
 
-RUN /bin/bash -c "source activate py3 && cd /usr/local/src/lightgbm/LightGBM/python-package && python setup.py install --precompile && source deactivate"
+RUN /bin/bash -c "source activate py3 && cd /usr/local/src/lightgbm/LightGBM && sh ./build-python.sh install --precompile && source deactivate"
 
 #################################################################################################################
 #           System CleanUp
diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 9f86b882e0a1..2a09fd674e4c 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -277,6 +277,10 @@ Python-package
 1. ``Error: setup script specifies an absolute path`` when installing from GitHub using ``python setup.py install``.
 --------------------------------------------------------------------------------------------------------------------
 
+.. note::
+    As of v4.0.0, ``lightgbm`` does not support directly invoking ``setup.py``.
+    This answer refers only to versions of ``lightgbm`` prior to v4.0.0.
+
 .. code-block:: console
 
    error: Error: setup script specifies an absolute path:
@@ -329,7 +333,7 @@ So, if you want to:
 We are doing our best to provide universal wheels which have high running speed and are compatible with any hardware, OS, compiler, etc. at the same time.
 However, sometimes it's just impossible to guarantee the possibility of usage of LightGBM in any specific environment (see `Microsoft/LightGBM#1743 <https://github.com/microsoft/LightGBM/issues/1743>`__).
 
-Therefore, the first thing you should try in case of segfaults is **compiling from the source** using ``pip install --no-binary :all: lightgbm``.
+Therefore, the first thing you should try in case of segfaults is **compiling from the source** using ``pip install --no-binary lightgbm lightgbm``.
 For the OS-specific prerequisites see `this guide <https://github.com/microsoft/LightGBM/blob/master/python-package/README.rst#user-content-build-from-sources>`__.
 
 Also, feel free to post a new issue in our GitHub repository. We always look at each case individually and try to find a root cause.
diff --git a/docs/GPU-Tutorial.rst b/docs/GPU-Tutorial.rst
index 1ca98784e3f6..836ab1add378 100644
--- a/docs/GPU-Tutorial.rst
+++ b/docs/GPU-Tutorial.rst
@@ -80,9 +80,7 @@ If you want to use the Python interface of LightGBM, you can install it now (alo
 
     sudo apt-get -y install python-pip
     sudo -H pip install setuptools numpy scipy scikit-learn -U
-    cd python-package/
-    sudo python setup.py install --precompile
-    cd ..
+    sudo sh ./build-python.sh install --precompile
 
 You need to set an additional parameter ``"device" : "gpu"`` (along with your other options like ``learning_rate``, ``num_leaves``, etc) to use GPU in Python.
 
diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index abbd8cb14e14..aee1cc4e7f84 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -658,6 +658,38 @@ Learning Control Parameters
 
    -  **Note**: can be used only in CLI version
 
+-  ``use_quantized_grad`` :raw-html:`<a id="use_quantized_grad" title="Permalink to this parameter" href="#use_quantized_grad">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
+
+   -  whether to use gradient quantization when training
+
+   -  enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
+
+   -  with quantized training, most arithmetics in the training process will be integer operations
+
+   -  gradient quantization can accelerate training, with little accuracy drop in most cases
+
+   -  **Note**: can be used only with ``device_type = cpu``
+
+-  ``num_grad_quant_bins`` :raw-html:`<a id="num_grad_quant_bins" title="Permalink to this parameter" href="#num_grad_quant_bins">&#x1F517;&#xFE0E;</a>`, default = ``4``, type = int
+
+   -  number of bins to quantization gradients and hessians
+
+   -  with more bins, the quantized training will be closer to full precision training
+
+   -  **Note**: can be used only with ``device_type = cpu``
+
+-  ``quant_train_renew_leaf`` :raw-html:`<a id="quant_train_renew_leaf" title="Permalink to this parameter" href="#quant_train_renew_leaf">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
+
+   -  whether to renew the leaf values with original gradients when quantized training
+
+   -  renewing is very helpful for good quantized training accuracy for ranking objectives
+
+   -  **Note**: can be used only with ``device_type = cpu``
+
+-  ``stochastic_rounding`` :raw-html:`<a id="stochastic_rounding" title="Permalink to this parameter" href="#stochastic_rounding">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool
+
+   -  whether to use stochastic rounding in gradient quantization
+
 IO Parameters
 -------------
 
diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst
index 7c1883652d96..04e64beb1281 100644
--- a/docs/Quick-Start.rst
+++ b/docs/Quick-Start.rst
@@ -59,14 +59,14 @@ Run LightGBM
 
 ::
 
-    "./lightgbm" config=your_config_file other_args ...
+    lightgbm config=your_config_file other_args ...
 
 Parameters can be set both in the config file and command line, and the parameters in command line have higher priority than in the config file.
 For example, the following command line will keep ``num_trees=10`` and ignore the same parameter in the config file.
 
 ::
 
-    "./lightgbm" config=train.conf num_trees=10
+    lightgbm config=train.conf num_trees=10
 
 Examples
 --------
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index a6199bbbcbd2..ffb8f2844843 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -30,11 +30,14 @@ enum MissingType {
 };
 
 typedef double hist_t;
+typedef int32_t int_hist_t;
 typedef uint64_t hist_cnt_t;
 // check at compile time
 static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
 
 const size_t kHistEntrySize = 2 * sizeof(hist_t);
+const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
+const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
 const int kHistOffset = 2;
 const double kSparseThreshold = 0.7;
 
@@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size
   }
 }
 
+inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+  const int64_t* src_ptr = reinterpret_cast<const int64_t*>(src);
+  int64_t* dst_ptr = reinterpret_cast<int64_t*>(dst);
+  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
+  const int num_threads = OMP_NUM_THREADS();
+  #pragma omp parallel for schedule(static) num_threads(num_threads)
+  for (comm_size_t i = 0; i < steps; ++i) {
+    dst_ptr[i] += src_ptr[i];
+  }
+}
+
+inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
+  const int32_t* src_ptr = reinterpret_cast<const int32_t*>(src);
+  int32_t* dst_ptr = reinterpret_cast<int32_t*>(dst);
+  const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
+  const int num_threads = OMP_NUM_THREADS();
+  #pragma omp parallel for schedule(static) num_threads(num_threads)
+  for (comm_size_t i = 0; i < steps; ++i) {
+    dst_ptr[i] += src_ptr[i];
+  }
+}
+
 /*! \brief This class used to convert feature values into bin,
 *          and store some meta information for bin*/
 class BinMapper {
@@ -332,6 +357,33 @@ class Bin {
     const score_t* ordered_gradients, const score_t* ordered_hessians,
     hist_t* out) const = 0;
 
+  virtual void ConstructHistogramInt8(
+    const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(
+    const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt32(
+    const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+    const score_t* ordered_gradients, const score_t* ordered_hessians,
+    hist_t* out) const = 0;
+
   /*!
   * \brief Construct histogram of this feature,
   *        Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
@@ -351,6 +403,24 @@ class Bin {
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
                                   const score_t* ordered_gradients, hist_t* out) const = 0;
 
+  virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                                       const score_t* ordered_gradients, hist_t* out) const = 0;
+
   virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
                             uint32_t default_bin, uint32_t most_freq_bin,
                             MissingType missing_type, bool default_left,
@@ -464,6 +534,57 @@ class MultiValBin {
                                          const score_t* ordered_hessians,
                                          hist_t* out) const = 0;
 
+  virtual void ConstructHistogramInt32(const data_size_t* data_indices,
+                                       data_size_t start, data_size_t end,
+                                       const score_t* gradients,
+                                       const score_t* hessians,
+                                       hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                                       const score_t* gradients,
+                                       const score_t* hessians,
+                                       hist_t* out) const = 0;
+
+  virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+                                              data_size_t start, data_size_t end,
+                                              const score_t* ordered_gradients,
+                                              const score_t* ordered_hessians,
+                                              hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(const data_size_t* data_indices,
+                                       data_size_t start, data_size_t end,
+                                       const score_t* gradients,
+                                       const score_t* hessians,
+                                       hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                                       const score_t* gradients,
+                                       const score_t* hessians,
+                                       hist_t* out) const = 0;
+
+  virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+                                              data_size_t start, data_size_t end,
+                                              const score_t* ordered_gradients,
+                                              const score_t* ordered_hessians,
+                                              hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt8(const data_size_t* data_indices,
+                                      data_size_t start, data_size_t end,
+                                      const score_t* gradients,
+                                      const score_t* hessians,
+                                      hist_t* out) const = 0;
+
+  virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                                      const score_t* gradients,
+                                      const score_t* hessians,
+                                      hist_t* out) const = 0;
+
+  virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+                                             data_size_t start, data_size_t end,
+                                             const score_t* ordered_gradients,
+                                             const score_t* ordered_hessians,
+                                             hist_t* out) const = 0;
+
   virtual void FinishLoad() = 0;
 
   virtual bool IsSparse() = 0;
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index cbb2735baeb2..89318a7af246 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -592,6 +592,30 @@ struct Config {
   // desc = **Note**: can be used only in CLI version
   int snapshot_freq = -1;
 
+  // [no-save]
+  // desc = whether to use gradient quantization when training
+  // desc = enabling this will discretize (quantize) the gradients and hessians into bins of ``num_grad_quant_bins``
+  // desc = with quantized training, most arithmetics in the training process will be integer operations
+  // desc = gradient quantization can accelerate training, with little accuracy drop in most cases
+  // desc = **Note**: can be used only with ``device_type = cpu``
+  bool use_quantized_grad = false;
+
+  // [no-save]
+  // desc = number of bins to quantization gradients and hessians
+  // desc = with more bins, the quantized training will be closer to full precision training
+  // desc = **Note**: can be used only with ``device_type = cpu``
+  int num_grad_quant_bins = 4;
+
+  // [no-save]
+  // desc = whether to renew the leaf values with original gradients when quantized training
+  // desc = renewing is very helpful for good quantized training accuracy for ranking objectives
+  // desc = **Note**: can be used only with ``device_type = cpu``
+  bool quant_train_renew_leaf = false;
+
+  // [no-save]
+  // desc = whether to use stochastic rounding in gradient quantization
+  bool stochastic_rounding = true;
+
   #ifndef __NVCC__
   #pragma endregion
 
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 79c4ed196b09..825c5c6ebcf8 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -598,10 +598,11 @@ class Dataset {
 
   MultiValBin* GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const;
 
+  template <bool USE_QUANT_GRAD, int HIST_BITS>
   TrainingShareStates* GetShareStates(
       score_t* gradients, score_t* hessians,
       const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
-      bool force_col_wise, bool force_row_wise) const;
+      bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const;
 
   LIGHTGBM_EXPORT void FinishLoad();
 
@@ -636,7 +637,7 @@ class Dataset {
   void InitTrain(const std::vector<int8_t>& is_feature_used,
                  TrainingShareStates* share_state) const;
 
-  template <bool USE_INDICES, bool USE_HESSIAN>
+  template <bool USE_INDICES, bool USE_HESSIAN, bool USE_QUANT_GRAD, int HIST_BITS>
   void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
                                 const data_size_t* data_indices,
                                 data_size_t num_data, const score_t* gradients,
@@ -646,7 +647,7 @@ class Dataset {
                                 TrainingShareStates* share_state,
                                 hist_t* hist_data) const;
 
-  template <bool USE_INDICES, bool ORDERED>
+  template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
   void ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                    data_size_t num_data,
                                    const score_t* gradients,
@@ -654,6 +655,7 @@ class Dataset {
                                    TrainingShareStates* share_state,
                                    hist_t* hist_data) const;
 
+  template <bool USE_QUANT_GRAD, int HIST_BITS>
   inline void ConstructHistograms(
       const std::vector<int8_t>& is_feature_used,
       const data_size_t* data_indices, data_size_t num_data,
@@ -666,21 +668,21 @@ class Dataset {
     bool use_indices = data_indices != nullptr && (num_data < num_data_);
     if (share_state->is_constant_hessian) {
       if (use_indices) {
-        ConstructHistogramsInner<true, false>(
+        ConstructHistogramsInner<true, false, USE_QUANT_GRAD, HIST_BITS>(
             is_feature_used, data_indices, num_data, gradients, hessians,
             ordered_gradients, ordered_hessians, share_state, hist_data);
       } else {
-        ConstructHistogramsInner<false, false>(
+        ConstructHistogramsInner<false, false, USE_QUANT_GRAD, HIST_BITS>(
             is_feature_used, data_indices, num_data, gradients, hessians,
             ordered_gradients, ordered_hessians, share_state, hist_data);
       }
     } else {
       if (use_indices) {
-        ConstructHistogramsInner<true, true>(
+        ConstructHistogramsInner<true, true, USE_QUANT_GRAD, HIST_BITS>(
             is_feature_used, data_indices, num_data, gradients, hessians,
             ordered_gradients, ordered_hessians, share_state, hist_data);
       } else {
-        ConstructHistogramsInner<false, true>(
+        ConstructHistogramsInner<false, true, USE_QUANT_GRAD, HIST_BITS>(
             is_feature_used, data_indices, num_data, gradients, hessians,
             ordered_gradients, ordered_hessians, share_state, hist_data);
       }
@@ -689,6 +691,9 @@ class Dataset {
 
   void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
 
+  template <typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
+  void FixHistogramInt(int feature_idx, int64_t sum_gradient_and_hessian, hist_t* data) const;
+
   inline data_size_t Split(int feature, const uint32_t* threshold,
                            int num_threshold, bool default_left,
                            const data_size_t* data_indices,
diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h
index 8c50734695b2..f102668edf70 100644
--- a/include/LightGBM/train_share_states.h
+++ b/include/LightGBM/train_share_states.h
@@ -19,7 +19,7 @@ namespace LightGBM {
 class MultiValBinWrapper {
  public:
   MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
-    const std::vector<int>& feature_groups_contained);
+    const std::vector<int>& feature_groups_contained, const int num_grad_quant_bins);
 
   bool IsSparse() {
     if (multi_val_bin_ != nullptr) {
@@ -34,15 +34,17 @@ class MultiValBinWrapper {
     const data_size_t* bagging_use_indices,
     data_size_t bagging_indices_cnt);
 
+  template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
   void HistMove(const std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
 
+  template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
   void HistMerge(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
 
   void ResizeHistBuf(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf,
     MultiValBin* sub_multi_val_bin,
     hist_t* origin_hist_data);
 
-  template <bool USE_INDICES, bool ORDERED>
+  template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
   void ConstructHistograms(const data_size_t* data_indices,
       data_size_t num_data,
       const score_t* gradients,
@@ -59,55 +61,145 @@ class MultiValBinWrapper {
       Threading::BlockInfo<data_size_t>(num_threads_, num_data, min_block_size_,
                                         &n_data_block_, &data_block_size_);
       ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data);
+      const int inner_hist_bits = (data_block_size_ * num_grad_quant_bins_ < 256 && HIST_BITS == 16) ? 8 : HIST_BITS;
       OMP_INIT_EX();
       #pragma omp parallel for schedule(static) num_threads(num_threads_)
       for (int block_id = 0; block_id < n_data_block_; ++block_id) {
         OMP_LOOP_EX_BEGIN();
         data_size_t start = block_id * data_block_size_;
         data_size_t end = std::min<data_size_t>(start + data_block_size_, num_data);
-        ConstructHistogramsForBlock<USE_INDICES, ORDERED>(
-          cur_multi_val_bin, start, end, data_indices, gradients, hessians,
-          block_id, hist_buf);
+        if (inner_hist_bits == 8) {
+          ConstructHistogramsForBlock<USE_INDICES, ORDERED, USE_QUANT_GRAD, 8>(
+            cur_multi_val_bin, start, end, data_indices, gradients, hessians,
+            block_id, hist_buf);
+        } else {
+          ConstructHistogramsForBlock<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
+            cur_multi_val_bin, start, end, data_indices, gradients, hessians,
+            block_id, hist_buf);
+        }
         OMP_LOOP_EX_END();
       }
       OMP_THROW_EX();
       global_timer.Stop("Dataset::sparse_bin_histogram");
 
       global_timer.Start("Dataset::sparse_bin_histogram_merge");
-      HistMerge(hist_buf);
+      if (inner_hist_bits == 8) {
+        HistMerge<USE_QUANT_GRAD, HIST_BITS, 8>(hist_buf);
+      } else {
+        HistMerge<USE_QUANT_GRAD, HIST_BITS, HIST_BITS>(hist_buf);
+      }
       global_timer.Stop("Dataset::sparse_bin_histogram_merge");
       global_timer.Start("Dataset::sparse_bin_histogram_move");
-      HistMove(*hist_buf);
+      if (inner_hist_bits == 8) {
+        HistMove<USE_QUANT_GRAD, HIST_BITS, 8>(*hist_buf);
+      } else {
+        HistMove<USE_QUANT_GRAD, HIST_BITS, HIST_BITS>(*hist_buf);
+      }
       global_timer.Stop("Dataset::sparse_bin_histogram_move");
     }
   }
 
-  template <bool USE_INDICES, bool ORDERED>
+  template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
   void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin,
     data_size_t start, data_size_t end, const data_size_t* data_indices,
     const score_t* gradients, const score_t* hessians, int block_id,
     std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf) {
-    hist_t* data_ptr = origin_hist_data_;
-    if (block_id == 0) {
-      if (is_use_subcol_) {
-        data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
+    if (USE_QUANT_GRAD) {
+      if (HIST_BITS == 8) {
+        int8_t* hist_buf_ptr = reinterpret_cast<int8_t*>(hist_buf->data());
+        int8_t* data_ptr = hist_buf_ptr +
+          static_cast<size_t>(num_bin_aligned_) * block_id * 2;
+        std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt8HistBufferEntrySize);
+        if (USE_INDICES) {
+          if (ORDERED) {
+            sub_multi_val_bin->ConstructHistogramOrderedInt8(data_indices, start, end,
+                                                              gradients, hessians,
+                                                              reinterpret_cast<hist_t*>(data_ptr));
+          } else {
+            sub_multi_val_bin->ConstructHistogramInt8(data_indices, start, end, gradients,
+                                                       hessians,
+                                                       reinterpret_cast<hist_t*>(data_ptr));
+          }
+        } else {
+          sub_multi_val_bin->ConstructHistogramInt8(start, end, gradients, hessians,
+                                                     reinterpret_cast<hist_t*>(data_ptr));
+        }
+      } else if (HIST_BITS == 16) {
+        int16_t* data_ptr = reinterpret_cast<int16_t*>(origin_hist_data_);
+        int16_t* hist_buf_ptr = reinterpret_cast<int16_t*>(hist_buf->data());
+        if (block_id == 0) {
+          if (is_use_subcol_) {
+            data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
+          }
+        } else {
+          data_ptr = hist_buf_ptr +
+            static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
+        }
+        std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt16HistBufferEntrySize);
+        if (USE_INDICES) {
+          if (ORDERED) {
+            sub_multi_val_bin->ConstructHistogramOrderedInt16(data_indices, start, end,
+                                                              gradients, hessians,
+                                                              reinterpret_cast<hist_t*>(data_ptr));
+          } else {
+            sub_multi_val_bin->ConstructHistogramInt16(data_indices, start, end, gradients,
+                                                       hessians,
+                                                       reinterpret_cast<hist_t*>(data_ptr));
+          }
+        } else {
+          sub_multi_val_bin->ConstructHistogramInt16(start, end, gradients, hessians,
+                                                     reinterpret_cast<hist_t*>(data_ptr));
+        }
+      } else {
+        int32_t* data_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
+        int32_t* hist_buf_ptr = reinterpret_cast<int32_t*>(hist_buf->data());
+        if (block_id == 0) {
+          if (is_use_subcol_) {
+            data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
+          }
+        } else {
+          data_ptr = hist_buf_ptr +
+            static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
+        }
+        std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt32HistBufferEntrySize);
+        if (USE_INDICES) {
+          if (ORDERED) {
+            sub_multi_val_bin->ConstructHistogramOrderedInt32(data_indices, start, end,
+                                                              gradients, hessians,
+                                                              reinterpret_cast<hist_t*>(data_ptr));
+          } else {
+            sub_multi_val_bin->ConstructHistogramInt32(data_indices, start, end, gradients,
+                                                       hessians,
+                                                       reinterpret_cast<hist_t*>(data_ptr));
+          }
+        } else {
+          sub_multi_val_bin->ConstructHistogramInt32(start, end, gradients, hessians,
+                                                     reinterpret_cast<hist_t*>(data_ptr));
+        }
       }
     } else {
-      data_ptr = hist_buf->data() +
-        static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
-    }
-    std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kHistBufferEntrySize);
-    if (USE_INDICES) {
-      if (ORDERED) {
-        sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
-                                                gradients, hessians, data_ptr);
+      hist_t* data_ptr = origin_hist_data_;
+      if (block_id == 0) {
+        if (is_use_subcol_) {
+          data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
+        }
       } else {
-        sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
-                                          hessians, data_ptr);
+        data_ptr = hist_buf->data() +
+          static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
+      }
+      std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kHistBufferEntrySize);
+      if (USE_INDICES) {
+        if (ORDERED) {
+          sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
+                                                  gradients, hessians, data_ptr);
+        } else {
+          sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
+                                            hessians, data_ptr);
+        }
+      } else {
+        sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
+                                          data_ptr);
       }
-    } else {
-      sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
-                                        data_ptr);
     }
   }
 
@@ -162,10 +254,14 @@ class MultiValBinWrapper {
   int data_block_size_;
   int min_block_size_;
   int num_data_;
+  int num_grad_quant_bins_;
 
   hist_t* origin_hist_data_;
 
   const size_t kHistBufferEntrySize = 2 * sizeof(hist_t);
+  const size_t kInt32HistBufferEntrySize = 2 * sizeof(int32_t);
+  const size_t kInt16HistBufferEntrySize = 2 * sizeof(int16_t);
+  const size_t kInt8HistBufferEntrySize = 2 * sizeof(int8_t);
 };
 
 struct TrainingShareStates {
@@ -193,7 +289,7 @@ struct TrainingShareStates {
 
   void SetMultiValBin(MultiValBin* bin, data_size_t num_data,
     const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
-    bool dense_only, bool sparse_only);
+    bool dense_only, bool sparse_only, const int num_grad_quant_bins);
 
   void CalcBinOffsets(const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
     std::vector<uint32_t>* offsets, bool is_col_wise);
@@ -210,14 +306,14 @@ struct TrainingShareStates {
     }
   }
 
-  template <bool USE_INDICES, bool ORDERED>
+  template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
   void ConstructHistograms(const data_size_t* data_indices,
                           data_size_t num_data,
                           const score_t* gradients,
                           const score_t* hessians,
                           hist_t* hist_data) {
     if (multi_val_bin_wrapper_ != nullptr) {
-      multi_val_bin_wrapper_->ConstructHistograms<USE_INDICES, ORDERED>(
+      multi_val_bin_wrapper_->ConstructHistograms<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
         data_indices, num_data, gradients, hessians, &hist_buf_, hist_data);
     }
   }
diff --git a/python-package/README.rst b/python-package/README.rst
index 6cabf33ab29c..2f927f4f2010 100644
--- a/python-package/README.rst
+++ b/python-package/README.rst
@@ -41,7 +41,7 @@ Build from Sources
 
 .. code:: sh
 
-    pip install --no-binary :all: lightgbm
+    pip install --no-binary lightgbm lightgbm
 
 For **Linux** and **macOS** users, installation from sources requires installed `CMake`_.
 
@@ -193,34 +193,33 @@ For **Windows** users, if you get any errors during installation and there is th
 .. code:: sh
 
     git clone --recursive https://github.com/microsoft/LightGBM.git
-    cd LightGBM/python-package
     # export CXX=g++-7 CC=gcc-7  # macOS users, if you decided to compile with gcc, don't forget to specify compilers (replace "7" with version of gcc installed on your machine)
-    python setup.py install
+    sh ./build-python.sh install
 
 Note: ``sudo`` (or administrator rights in **Windows**) may be needed to perform the command.
 
-Run ``python setup.py install --nomp`` to disable **OpenMP** support. All requirements from `Build Threadless Version section <#build-threadless-version>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --nomp`` to disable **OpenMP** support. All requirements from `Build Threadless Version section <#build-threadless-version>`__ apply for this installation option as well.
 
-Run ``python setup.py install --mpi`` to enable **MPI** support. All requirements from `Build MPI Version section <#build-mpi-version>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --mpi`` to enable **MPI** support. All requirements from `Build MPI Version section <#build-mpi-version>`__ apply for this installation option as well.
 
-Run ``python setup.py install --mingw``, if you want to use **MinGW-w64** on **Windows** instead of **Visual Studio**. All requirements from `Build with MinGW-w64 on Windows section <#build-with-mingw-w64-on-windows>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --mingw``, if you want to use **MinGW-w64** on **Windows** instead of **Visual Studio**. All requirements from `Build with MinGW-w64 on Windows section <#build-with-mingw-w64-on-windows>`__ apply for this installation option as well.
 
-Run ``python setup.py install --gpu`` to enable GPU support. All requirements from `Build GPU Version section <#build-gpu-version>`__ apply for this installation option as well. To pass additional options to **CMake** use the following syntax: ``python setup.py install --gpu --opencl-include-dir=/usr/local/cuda/include/``, see `Build GPU Version section <#build-gpu-version>`__ for the complete list of them.
+Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requirements from `Build GPU Version section <#build-gpu-version>`__ apply for this installation option as well. To pass additional options to **CMake** use the following syntax: ``sh ./build-python.sh install --gpu --opencl-include-dir="/usr/local/cuda/include/"``, see `Build GPU Version section <#build-gpu-version>`__ for the complete list of them.
 
-Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
 
-Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.
 
-Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.
 
-Run ``python setup.py install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well.
+Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well.
 
-If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst>`__) and then just run ``python setup.py install --precompile``.
+If you get any errors during installation or due to any other reasons, you may want to build dynamic library from sources by any method you prefer (see `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst>`__) and then just run ``sh ./build-python.sh install --precompile``.
 
 Build Wheel File
 ****************
 
-You can use ``python setup.py bdist_wheel`` instead of ``python setup.py install`` to build wheel file and use it for installation later. This might be useful for systems with restricted or completely without network access.
+You can use ``sh ./build-python.sh bdist_wheel`` instead of ``sh ./build-python.sh install`` to build a wheel file and use it for installation later. This might be useful for systems with restricted or no network access.
 
 Install Dask-package
 ''''''''''''''''''''
@@ -235,7 +234,7 @@ To install all additional dependencies required for Dask-package, you can append
 
     pip install lightgbm[dask]
 
-Or replace ``python setup.py install`` with ``pip install -e .[dask]`` if you are installing the package from source files.
+Or replace ``sh ./build-python.sh install`` with ``pip install -e .[dask]`` if you are installing the package from source files.
 
 Troubleshooting
 ---------------
@@ -252,9 +251,15 @@ Refer to the walk through examples in `Python guide folder <https://github.com/m
 Development Guide
 -----------------
 
-The code style of Python-package follows `PEP 8 <https://www.python.org/dev/peps/pep-0008/>`_. If you would like to make a contribution and not familiar with PEP 8, please check the PEP 8 style guide first. Otherwise, the check won't pass. Only E501 (line too long) and W503 (line break occurred before a binary operator) can be ignored.
+The code style of Python-package follows `PEP 8 <https://www.python.org/dev/peps/pep-0008/>`_.
 
-Documentation strings (docstrings) are written in the NumPy style.
+The package's documentation strings (docstrings) are written in the `numpydoc style <https://numpydoc.readthedocs.io/en/latest/format.html>`_.
+
+To check that a contribution to the package matches its style expectations, run the following from the root of the repo.
+
+.. code:: sh
+
+    sh .ci/lint-python.sh
 
 .. |License| image:: https://img.shields.io/github/license/microsoft/lightgbm.svg
    :target: https://github.com/microsoft/LightGBM/blob/master/LICENSE
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index 445a1047d959..fd07283aa236 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -2,6 +2,7 @@
 """Wrapper for C API of LightGBM."""
 import abc
 import ctypes
+import inspect
 import json
 import warnings
 from collections import OrderedDict
@@ -12,7 +13,7 @@
 from os.path import getsize
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import numpy as np
 import scipy.sparse
@@ -20,6 +21,9 @@
 from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series
 from .libpath import find_lib_path
 
+if TYPE_CHECKING:
+    from typing import Literal
+
 __all__ = [
     'Booster',
     'Dataset',
@@ -34,6 +38,10 @@
     "ctypes._Pointer[ctypes.c_int32]",
     "ctypes._Pointer[ctypes.c_int64]"
 ]
+_ctypes_int_array = Union[
+    "ctypes.Array[ctypes._Pointer[ctypes.c_int32]]",
+    "ctypes.Array[ctypes._Pointer[ctypes.c_int64]]"
+]
 _ctypes_float_ptr = Union[
     "ctypes._Pointer[ctypes.c_float]",
     "ctypes._Pointer[ctypes.c_double]"
@@ -45,8 +53,8 @@
 _LGBM_EvalFunctionResultType = Tuple[str, float, bool]
 _LGBM_BoosterBestScoreType = Dict[str, Dict[str, float]]
 _LGBM_BoosterEvalMethodResultType = Tuple[str, str, float, bool]
-_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], str]
-_LGBM_FeatureNameConfiguration = Union[List[str], str]
+_LGBM_CategoricalFeatureConfiguration = Union[List[str], List[int], "Literal['auto']"]
+_LGBM_FeatureNameConfiguration = Union[List[str], "Literal['auto']"]
 _LGBM_GroupType = Union[
     List[float],
     List[int],
@@ -72,7 +80,8 @@
     List[np.ndarray]
 ]
 _LGBM_LabelType = Union[
-    list,
+    List[float],
+    List[int],
     np.ndarray,
     pd_Series,
     pd_DataFrame
@@ -276,8 +285,8 @@ def _is_1d_collection(data: Any) -> bool:
 
 def _list_to_1d_numpy(
     data: Any,
-    dtype: "np.typing.DTypeLike" = np.float32,
-    name: str = 'list'
+    dtype: "np.typing.DTypeLike",
+    name: str
 ) -> np.ndarray:
     """Convert data to numpy 1-D array."""
     if _is_numpy_1d_array(data):
@@ -317,8 +326,8 @@ def _is_2d_collection(data: Any) -> bool:
 
 def _data_to_2d_numpy(
     data: Any,
-    dtype: "np.typing.DTypeLike" = np.float32,
-    name: str = 'list'
+    dtype: "np.typing.DTypeLike",
+    name: str
 ) -> np.ndarray:
     """Convert data to numpy 2-D array."""
     if _is_numpy_2d_array(data):
@@ -588,13 +597,16 @@ def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray:
     return data
 
 
-def _c_float_array(data):
+def _c_float_array(
+    data: np.ndarray
+) -> Tuple[_ctypes_float_ptr, int, np.ndarray]:
     """Get pointer of float numpy array / list."""
     if _is_1d_list(data):
         data = np.array(data, copy=False)
     if _is_numpy_1d_array(data):
         data = _convert_from_sliced_object(data)
         assert data.flags.c_contiguous
+        ptr_data: _ctypes_float_ptr
         if data.dtype == np.float32:
             ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
             type_data = _C_API_DTYPE_FLOAT32
@@ -608,13 +620,16 @@ def _c_float_array(data):
     return (ptr_data, type_data, data)  # return `data` to avoid the temporary copy is freed
 
 
-def _c_int_array(data):
+def _c_int_array(
+    data: np.ndarray
+) -> Tuple[_ctypes_int_ptr, int, np.ndarray]:
     """Get pointer of int numpy array / list."""
     if _is_1d_list(data):
         data = np.array(data, copy=False)
     if _is_numpy_1d_array(data):
         data = _convert_from_sliced_object(data)
         assert data.flags.c_contiguous
+        ptr_data: _ctypes_int_ptr
         if data.dtype == np.int32:
             ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
             type_data = _C_API_DTYPE_INT32
@@ -677,7 +692,7 @@ def _data_from_pandas(
             if categorical_feature == 'auto':  # use cat cols from DataFrame
                 categorical_feature = cat_cols_not_ordered
             else:  # use cat cols specified by user
-                categorical_feature = list(categorical_feature)
+                categorical_feature = list(categorical_feature)  # type: ignore[assignment]
         if feature_name == 'auto':
             feature_name = list(data.columns)
         _check_for_bad_pandas_dtypes(data.dtypes)
@@ -982,8 +997,8 @@ def predict(
         elif isinstance(data, list):
             try:
                 data = np.array(data)
-            except BaseException:
-                raise ValueError('Cannot convert data list to numpy array.')
+            except BaseException as err:
+                raise ValueError('Cannot convert data list to numpy array.') from err
             preds, nrow = self.__pred_for_np2d(
                 mat=data,
                 start_iteration=start_iteration,
@@ -1001,8 +1016,8 @@ def predict(
             try:
                 _log_warning('Converting data to scipy sparse matrix.')
                 csr = scipy.sparse.csr_matrix(data)
-            except BaseException:
-                raise TypeError(f'Cannot predict data for type {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot predict data for type {type(data).__name__}') from err
             preds, nrow = self.__pred_for_csr(
                 csr=csr,
                 start_iteration=start_iteration,
@@ -1623,10 +1638,10 @@ def _init_from_sample(
 
         # c type: double**
         # each double* element points to start of each column of sample data.
-        sample_col_ptr = (ctypes.POINTER(ctypes.c_double) * ncol)()
+        sample_col_ptr: _ctypes_float_array = (ctypes.POINTER(ctypes.c_double) * ncol)()
         # c type int**
         # each int* points to start of indices for each column
-        indices_col_ptr = (ctypes.POINTER(ctypes.c_int32) * ncol)()
+        indices_col_ptr: _ctypes_int_array = (ctypes.POINTER(ctypes.c_int32) * ncol)()
         for i in range(ncol):
             sample_col_ptr[i] = _c_float_array(sample_data[i])[0]
             indices_col_ptr[i] = _c_int_array(sample_indices[i])[0]
@@ -1724,18 +1739,20 @@ def _free_handle(self) -> "Dataset":
     def _set_init_score_by_predictor(
         self,
         predictor: Optional[_InnerPredictor],
-        data,
-        used_indices: Optional[List[int]]
-    ):
+        data: _LGBM_TrainDataType,
+        used_indices: Optional[Union[List[int], np.ndarray]]
+    ) -> "Dataset":
         data_has_header = False
         if isinstance(data, (str, Path)) and self.params is not None:
             # check data has header or not
             data_has_header = any(self.params.get(alias, False) for alias in _ConfigAliases.get("header"))
         num_data = self.num_data()
         if predictor is not None:
-            init_score = predictor.predict(data,
-                                           raw_score=True,
-                                           data_has_header=data_has_header)
+            init_score: Union[np.ndarray, scipy.sparse.spmatrix] = predictor.predict(
+                data=data,
+                raw_score=True,
+                data_has_header=data_has_header
+            )
             init_score = init_score.ravel()
             if used_indices is not None:
                 assert not self._need_slice
@@ -1754,23 +1771,24 @@ def _set_init_score_by_predictor(
                         new_init_score[j * num_data + i] = init_score[i * predictor.num_class + j]
                 init_score = new_init_score
         elif self.init_score is not None:
-            init_score = np.zeros(self.init_score.shape, dtype=np.float64)
+            init_score = np.full_like(self.init_score, fill_value=0.0, dtype=np.float64)
         else:
             return self
         self.set_init_score(init_score)
+        return self
 
     def _lazy_init(
         self,
         data: Optional[_LGBM_TrainDataType],
-        label: Optional[_LGBM_LabelType] = None,
-        reference: Optional["Dataset"] = None,
-        weight: Optional[_LGBM_WeightType] = None,
-        group: Optional[_LGBM_GroupType] = None,
-        init_score: Optional[_LGBM_InitScoreType] = None,
-        predictor=None,
-        feature_name='auto',
-        categorical_feature='auto',
-        params: Optional[Dict[str, Any]] = None
+        label: Optional[_LGBM_LabelType],
+        reference: Optional["Dataset"],
+        weight: Optional[_LGBM_WeightType],
+        group: Optional[_LGBM_GroupType],
+        init_score: Optional[_LGBM_InitScoreType],
+        predictor: Optional[_InnerPredictor],
+        feature_name: _LGBM_FeatureNameConfiguration,
+        categorical_feature: _LGBM_CategoricalFeatureConfiguration,
+        params: Optional[Dict[str, Any]]
     ) -> "Dataset":
         if data is None:
             self.handle = None
@@ -1778,16 +1796,14 @@ def _lazy_init(
         if reference is not None:
             self.pandas_categorical = reference.pandas_categorical
             categorical_feature = reference.categorical_feature
-        data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data,
-                                                                                             feature_name,
-                                                                                             categorical_feature,
-                                                                                             self.pandas_categorical)
+        data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data,
+                                                                                             feature_name=feature_name,
+                                                                                             categorical_feature=categorical_feature,
+                                                                                             pandas_categorical=self.pandas_categorical)
 
         # process for args
         params = {} if params is None else params
-        args_names = (getattr(self.__class__, '_lazy_init')
-                      .__code__
-                      .co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount])
+        args_names = inspect.signature(self.__class__._lazy_init).parameters.keys()
         for key in params.keys():
             if key in args_names:
                 _log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
@@ -1851,8 +1867,8 @@ def _lazy_init(
             try:
                 csr = scipy.sparse.csr_matrix(data)
                 self.__init_from_csr(csr, params_str, ref_dataset)
-            except BaseException:
-                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}')
+            except BaseException as err:
+                raise TypeError(f'Cannot initialize Dataset from {type(data).__name__}') from err
         if label is not None:
             self.set_label(label)
         if self.get_label() is None:
@@ -1903,7 +1919,7 @@ def __sample(self, seqs: List[Sequence], total_nrow: int) -> Tuple[List[np.ndarr
         indices = self._create_sample_indices(total_nrow)
 
         # Select sampled rows, transpose to column order.
-        sampled = np.array([row for row in self._yield_row_from_seqlist(seqs, indices)])
+        sampled = np.array(list(self._yield_row_from_seqlist(seqs, indices)))
         sampled = sampled.T
 
         filtered = []
@@ -1996,7 +2012,7 @@ def __init_from_list_np2d(
             ptr_data = (ctypes.POINTER(ctypes.c_float) * len(mats))()
 
         holders = []
-        type_ptr_data = None
+        type_ptr_data = -1
 
         for i, mat in enumerate(mats):
             if len(mat.shape) != 2:
@@ -2013,7 +2029,7 @@ def __init_from_list_np2d(
                 mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)
 
             chunk_ptr_data, chunk_type_ptr_data, holder = _c_float_array(mats[i])
-            if type_ptr_data is not None and chunk_type_ptr_data != type_ptr_data:
+            if type_ptr_data != -1 and chunk_type_ptr_data != type_ptr_data:
                 raise ValueError('Input chunks must have same type')
             ptr_data[i] = chunk_ptr_data
             type_ptr_data = chunk_type_ptr_data
@@ -2154,13 +2170,13 @@ def construct(self) -> "Dataset":
                     self._update_params(reference_params)
                 if self.used_indices is None:
                     # create valid
-                    self._lazy_init(self.data, label=self.label, reference=self.reference,
+                    self._lazy_init(data=self.data, label=self.label, reference=self.reference,
                                     weight=self.weight, group=self.group,
                                     init_score=self.init_score, predictor=self._predictor,
-                                    feature_name=self.feature_name, params=self.params)
+                                    feature_name=self.feature_name, categorical_feature='auto', params=self.params)
                 else:
                     # construct subset
-                    used_indices = _list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
+                    used_indices = _list_to_1d_numpy(self.used_indices, dtype=np.int32, name='used_indices')
                     assert used_indices.flags.c_contiguous
                     if self.reference.group is not None:
                         group_info = np.array(self.reference.group).astype(np.int32, copy=False)
@@ -2189,7 +2205,7 @@ def construct(self) -> "Dataset":
                         )
             else:
                 # create train
-                self._lazy_init(self.data, label=self.label,
+                self._lazy_init(data=self.data, label=self.label, reference=None,
                                 weight=self.weight, group=self.group,
                                 init_score=self.init_score, predictor=self._predictor,
                                 feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=self.params)
@@ -2329,7 +2345,7 @@ def _reverse_update_params(self) -> "Dataset":
     def set_field(
         self,
         field_name: str,
-        data
+        data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame]]
     ) -> "Dataset":
         """Set property into the Dataset.
 
@@ -2360,9 +2376,9 @@ def set_field(
         if field_name == 'init_score':
             dtype = np.float64
             if _is_1d_collection(data):
-                data = _list_to_1d_numpy(data, dtype, name=field_name)
+                data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
             elif _is_2d_collection(data):
-                data = _data_to_2d_numpy(data, dtype, name=field_name)
+                data = _data_to_2d_numpy(data, dtype=dtype, name=field_name)
                 data = data.ravel(order='F')
             else:
                 raise TypeError(
@@ -2371,8 +2387,9 @@ def set_field(
                 )
         else:
             dtype = np.int32 if field_name == 'group' else np.float32
-            data = _list_to_1d_numpy(data, dtype, name=field_name)
+            data = _list_to_1d_numpy(data, dtype=dtype, name=field_name)
 
+        ptr_data: Union[_ctypes_float_ptr, _ctypes_int_ptr]
         if data.dtype == np.float32 or data.dtype == np.float64:
             ptr_data, type_data, _ = _c_float_array(data)
         elif data.dtype == np.int32:
@@ -2460,7 +2477,7 @@ def set_categorical_feature(
             else:
                 if self.categorical_feature != 'auto':
                     _log_warning('categorical_feature in Dataset is overridden.\n'
-                                 f'New categorical_feature is {sorted(list(categorical_feature))}')
+                                 f'New categorical_feature is {list(categorical_feature)}')
                 self.categorical_feature = categorical_feature
                 return self._free_handle()
         else:
@@ -2528,7 +2545,7 @@ def set_reference(self, reference: "Dataset") -> "Dataset":
             raise LightGBMError("Cannot set reference after freed raw data, "
                                 "set free_raw_data=False when construct Dataset to avoid this.")
 
-    def set_feature_name(self, feature_name: Union[List[str], str]) -> "Dataset":
+    def set_feature_name(self, feature_name: _LGBM_FeatureNameConfiguration) -> "Dataset":
         """Set feature name.
 
         Parameters
@@ -2584,7 +2601,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset":
                     label = label.to_numpy(dtype=np.float32, na_value=np.nan)
                 label_array = np.ravel(label)
             else:
-                label_array = _list_to_1d_numpy(label, name='label')
+                label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label')
             self.set_field('label', label_array)
             self.label = self.get_field('label')  # original values can be modified at cpp side
         return self
@@ -2609,7 +2626,7 @@ def set_weight(
             weight = None
         self.weight = weight
         if self.handle is not None and weight is not None:
-            weight = _list_to_1d_numpy(weight, name='weight')
+            weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight')
             self.set_field('weight', weight)
             self.weight = self.get_field('weight')  # original values can be modified at cpp side
         return self
@@ -2658,7 +2675,7 @@ def set_group(
         """
         self.group = group
         if self.handle is not None and group is not None:
-            group = _list_to_1d_numpy(group, np.int32, name='group')
+            group = _list_to_1d_numpy(group, dtype=np.int32, name='group')
             self.set_field('group', group)
         return self
 
@@ -2759,7 +2776,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
                 elif isinstance(self.data, Sequence):
                     self.data = self.data[self.used_indices]
                 elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data):
-                    self.data = np.array([row for row in self._yield_row_from_seqlist(self.data, self.used_indices)])
+                    self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices)))
                 else:
                     _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n"
                                  "Returning original raw data")
@@ -3097,7 +3114,7 @@ def __init__(
                 ctypes.byref(out_num_class)))
             self.__num_class = out_num_class.value
             # buffer for inner predict
-            self.__inner_predict_buffer = [None]
+            self.__inner_predict_buffer: List[Optional[np.ndarray]] = [None]
             self.__is_predicted_cur_iter = [False]
             self.__get_eval_info()
             self.pandas_categorical = train_set.pandas_categorical
@@ -3288,13 +3305,21 @@ def trees_to_dataframe(self) -> pd_DataFrame:
         if self.num_trees() == 0:
             raise LightGBMError('There are no trees in this Booster and thus nothing to parse')
 
-        def _is_split_node(tree):
+        def _is_split_node(tree: Dict[str, Any]) -> bool:
             return 'split_index' in tree.keys()
 
-        def create_node_record(tree, node_depth=1, tree_index=None,
-                               feature_names=None, parent_node=None):
-
-            def _get_node_index(tree, tree_index):
+        def create_node_record(
+            tree: Dict[str, Any],
+            node_depth: int = 1,
+            tree_index: Optional[int] = None,
+            feature_names: Optional[List[str]] = None,
+            parent_node: Optional[str] = None
+        ) -> Dict[str, Any]:
+
+            def _get_node_index(
+                tree: Dict[str, Any],
+                tree_index: Optional[int]
+            ) -> str:
                 tree_num = f'{tree_index}-' if tree_index is not None else ''
                 is_split = _is_split_node(tree)
                 node_type = 'S' if is_split else 'L'
@@ -3302,7 +3327,10 @@ def _get_node_index(tree, tree_index):
                 node_num = tree.get('split_index' if is_split else 'leaf_index', 0)
                 return f"{tree_num}{node_type}{node_num}"
 
-            def _get_split_feature(tree, feature_names):
+            def _get_split_feature(
+                tree: Dict[str, Any],
+                feature_names: Optional[List[str]]
+            ) -> Optional[str]:
                 if _is_split_node(tree):
                     if feature_names is not None:
                         feature_name = feature_names[tree['split_feature']]
@@ -3312,11 +3340,11 @@ def _get_split_feature(tree, feature_names):
                     feature_name = None
                 return feature_name
 
-            def _is_single_node_tree(tree):
+            def _is_single_node_tree(tree: Dict[str, Any]) -> bool:
                 return set(tree.keys()) == {'leaf_value'}
 
             # Create the node record, and populate universal data members
-            node = OrderedDict()
+            node: Dict[str, Union[int, str, None]] = OrderedDict()
             node['tree_index'] = tree_index
             node['node_depth'] = node_depth
             node['node_index'] = _get_node_index(tree, tree_index)
@@ -3353,10 +3381,15 @@ def _is_single_node_tree(tree):
 
             return node
 
-        def tree_dict_to_node_list(tree, node_depth=1, tree_index=None,
-                                   feature_names=None, parent_node=None):
+        def tree_dict_to_node_list(
+            tree: Dict[str, Any],
+            node_depth: int = 1,
+            tree_index: Optional[int] = None,
+            feature_names: Optional[List[str]] = None,
+            parent_node: Optional[str] = None
+        ) -> List[Dict[str, Any]]:
 
-            node = create_node_record(tree,
+            node = create_node_record(tree=tree,
                                       node_depth=node_depth,
                                       tree_index=tree_index,
                                       feature_names=feature_names,
@@ -3369,11 +3402,12 @@ def tree_dict_to_node_list(tree, node_depth=1, tree_index=None,
                 children = ['left_child', 'right_child']
                 for child in children:
                     subtree_list = tree_dict_to_node_list(
-                        tree[child],
+                        tree=tree[child],
                         node_depth=node_depth + 1,
                         tree_index=tree_index,
                         feature_names=feature_names,
-                        parent_node=node['node_index'])
+                        parent_node=node['node_index']
+                    )
                     # In tree format, "subtree_list" is a list of node records (dicts),
                     # and we add node to the list.
                     res.extend(subtree_list)
@@ -3383,7 +3417,7 @@ def tree_dict_to_node_list(tree, node_depth=1, tree_index=None,
         feature_names = model_dict['feature_names']
         model_list = []
         for tree in model_dict['tree_info']:
-            model_list.extend(tree_dict_to_node_list(tree['tree_structure'],
+            model_list.extend(tree_dict_to_node_list(tree=tree['tree_structure'],
                                                      tree_index=tree['tree_index'],
                                                      feature_names=feature_names))
 
@@ -3558,8 +3592,8 @@ def __boost(
         if self.__num_class > 1:
             grad = grad.ravel(order='F')
             hess = hess.ravel(order='F')
-        grad = _list_to_1d_numpy(grad, name='gradient')
-        hess = _list_to_1d_numpy(hess, name='hessian')
+        grad = _list_to_1d_numpy(grad, dtype=np.float32, name='gradient')
+        hess = _list_to_1d_numpy(hess, dtype=np.float32, name='hessian')
         assert grad.flags.c_contiguous
         assert hess.flags.c_contiguous
         if len(grad) != len(hess):
@@ -4068,7 +4102,7 @@ def predict(
             Prediction result.
             Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``).
         """
-        predictor = self._to_predictor(deepcopy(kwargs))
+        predictor = self._to_predictor(pred_parameter=deepcopy(kwargs))
         if num_iteration is None:
             if start_iteration <= 0:
                 num_iteration = self.best_iteration
@@ -4158,8 +4192,8 @@ def refit(
             raise LightGBMError('Cannot refit due to null objective function.')
         if dataset_params is None:
             dataset_params = {}
-        predictor = self._to_predictor(deepcopy(kwargs))
-        leaf_preds = predictor.predict(
+        predictor = self._to_predictor(pred_parameter=deepcopy(kwargs))
+        leaf_preds: np.ndarray = predictor.predict(  # type: ignore[assignment]
             data=data,
             start_iteration=-1,
             pred_leaf=True,
@@ -4262,7 +4296,7 @@ def set_leaf_output(
 
     def _to_predictor(
         self,
-        pred_parameter: Optional[Dict[str, Any]] = None
+        pred_parameter: Dict[str, Any]
     ) -> _InnerPredictor:
         """Convert to predictor."""
         predictor = _InnerPredictor(booster_handle=self.handle, pred_parameter=pred_parameter)
@@ -4414,7 +4448,7 @@ def add(root: Dict[str, Any]) -> None:
         model = self.dump_model()
         feature_names = model.get('feature_names')
         tree_infos = model['tree_info']
-        values = []
+        values: List[float] = []
         for tree_info in tree_infos:
             add(tree_info['tree_structure'])
 
@@ -4488,16 +4522,16 @@ def __inner_predict(self, data_idx: int) -> np.ndarray:
         # avoid to predict many time in one iteration
         if not self.__is_predicted_cur_iter[data_idx]:
             tmp_out_len = ctypes.c_int64(0)
-            data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+            data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))  # type: ignore[union-attr]
             _safe_call(_LIB.LGBM_BoosterGetPredict(
                 self.handle,
                 ctypes.c_int(data_idx),
                 ctypes.byref(tmp_out_len),
                 data_ptr))
-            if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
+            if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):  # type: ignore[arg-type]
                 raise ValueError(f"Wrong length of predict results for data {data_idx}")
             self.__is_predicted_cur_iter[data_idx] = True
-        result = self.__inner_predict_buffer[data_idx]
+        result: np.ndarray = self.__inner_predict_buffer[data_idx]  # type: ignore[assignment]
         if self.__num_class > 1:
             num_data = result.size // self.__num_class
             result = result.reshape(num_data, self.__num_class, order='F')
diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py
index 45f67edf5949..0c5d3e7956fa 100644
--- a/python-package/lightgbm/callback.py
+++ b/python-package/lightgbm/callback.py
@@ -15,6 +15,10 @@
 
 _EvalResultDict = Dict[str, Dict[str, List[Any]]]
 _EvalResultTuple = Union[
+    _LGBM_BoosterEvalMethodResultType,
+    Tuple[str, str, float, bool, float]
+]
+_ListOfEvalResultTuples = Union[
     List[_LGBM_BoosterEvalMethodResultType],
     List[Tuple[str, str, float, bool, float]]
 ]
@@ -23,7 +27,7 @@
 class EarlyStopException(Exception):
     """Exception of early stopping."""
 
-    def __init__(self, best_iteration: int, best_score: _EvalResultTuple) -> None:
+    def __init__(self, best_iteration: int, best_score: _ListOfEvalResultTuples) -> None:
         """Create early stopping exception.
 
         Parameters
@@ -55,7 +59,7 @@ def _format_eval_result(value: _EvalResultTuple, show_stdv: bool) -> str:
         return f"{value[0]}'s {value[1]}: {value[2]:g}"
     elif len(value) == 5:
         if show_stdv:
-            return f"{value[0]}'s {value[1]}: {value[2]:g} + {value[4]:g}"
+            return f"{value[0]}'s {value[1]}: {value[2]:g} + {value[4]:g}"  # type: ignore[misc]
         else:
             return f"{value[0]}'s {value[1]}: {value[2]:g}"
     else:
@@ -256,7 +260,7 @@ def __init__(
     def _reset_storages(self) -> None:
         self.best_score: List[float] = []
         self.best_iter: List[int] = []
-        self.best_score_list: List[Union[_EvalResultTuple, None]] = []
+        self.best_score_list: List[_ListOfEvalResultTuples] = []
         self.cmp_op: List[Callable[[float, float], bool]] = []
         self.first_metric = ''
 
@@ -297,7 +301,7 @@ def _init(self, env: CallbackEnv) -> None:
 
         self._reset_storages()
 
-        n_metrics = len(set(m[1] for m in env.evaluation_result_list))
+        n_metrics = len({m[1] for m in env.evaluation_result_list})
         n_datasets = len(env.evaluation_result_list) // n_metrics
         if isinstance(self.min_delta, list):
             if not all(t >= 0 for t in self.min_delta):
@@ -327,7 +331,6 @@ def _init(self, env: CallbackEnv) -> None:
         self.first_metric = env.evaluation_result_list[0][1].split(" ")[-1]
         for eval_ret, delta in zip(env.evaluation_result_list, deltas):
             self.best_iter.append(0)
-            self.best_score_list.append(None)
             if eval_ret[3]:  # greater is better
                 self.best_score.append(float('-inf'))
                 self.cmp_op.append(partial(self._gt_delta, delta=delta))
@@ -350,12 +353,17 @@ def __call__(self, env: CallbackEnv) -> None:
             self._init(env)
         if not self.enabled:
             return
+        # self.best_score_list is initialized to an empty list
+        first_time_updating_best_score_list = (self.best_score_list == [])
         for i in range(len(env.evaluation_result_list)):
             score = env.evaluation_result_list[i][2]
-            if self.best_score_list[i] is None or self.cmp_op[i](score, self.best_score[i]):
+            if first_time_updating_best_score_list or self.cmp_op[i](score, self.best_score[i]):
                 self.best_score[i] = score
                 self.best_iter[i] = env.iteration
-                self.best_score_list[i] = env.evaluation_result_list
+                if first_time_updating_best_score_list:
+                    self.best_score_list.append(env.evaluation_result_list)
+                else:
+                    self.best_score_list[i] = env.evaluation_result_list
             # split is needed for "<dataset type> <metric>" case (e.g. "train l1")
             eval_name_splitted = env.evaluation_result_list[i][1].split(" ")
             if self.first_metric_only and self.first_metric != eval_name_splitted[-1]:
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
index 02bf7af2d253..c856fa1a9b11 100644
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@@ -77,9 +77,9 @@ def __init__(self, *args, **kwargs):
     from sklearn.utils.validation import assert_all_finite, check_array, check_X_y
     try:
         from sklearn.exceptions import NotFittedError
-        from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold
+        from sklearn.model_selection import BaseCrossValidator, GroupKFold, StratifiedKFold, train_test_split
     except ImportError:
-        from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold
+        from sklearn.cross_validation import BaseCrossValidator, GroupKFold, StratifiedKFold, train_test_split
         from sklearn.utils.validation import NotFittedError
     try:
         from sklearn.utils.validation import _check_sample_weight
@@ -100,6 +100,7 @@ def _check_sample_weight(sample_weight, X, dtype=None):
     LGBMNotFittedError = NotFittedError
     _LGBMStratifiedKFold = StratifiedKFold
     _LGBMGroupKFold = GroupKFold
+    _LGBMTrainTestSplit = train_test_split
     _LGBMCheckXY = check_X_y
     _LGBMCheckArray = check_array
     _LGBMCheckSampleWeight = _check_sample_weight
diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py
index 8ea1d6907081..88487d515f81 100644
--- a/python-package/lightgbm/dask.py
+++ b/python-package/lightgbm/dask.py
@@ -21,9 +21,9 @@
 from .compat import (DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED, Client, LGBMNotFittedError, concat,
                      dask_Array, dask_array_from_delayed, dask_bag_from_delayed, dask_DataFrame, dask_Series,
                      default_client, delayed, pd_DataFrame, pd_Series, wait)
-from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomObjectiveFunction,
-                      _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note, _lgbmmodel_doc_fit,
-                      _lgbmmodel_doc_predict)
+from .sklearn import (LGBMClassifier, LGBMModel, LGBMRanker, LGBMRegressor, _LGBM_ScikitCustomEvalSetSplitter,
+                      _LGBM_ScikitCustomObjectiveFunction, _LGBM_ScikitEvalMetricType, _lgbmmodel_doc_custom_eval_note,
+                      _lgbmmodel_doc_fit, _lgbmmodel_doc_predict)
 
 __all__ = [
     'DaskLGBMClassifier',
@@ -189,7 +189,7 @@ def _train_part(
     local_listen_port: int,
     num_machines: int,
     return_model: bool,
-    time_out: int = 120,
+    time_out: int,
     **kwargs: Any
 ) -> Optional[LGBMModel]:
     network_params = {
@@ -576,13 +576,48 @@ def _train(
         # pad eval sets when they come in different sizes.
         n_largest_eval_parts = max(x[0].npartitions for x in eval_set)
 
-        eval_sets = defaultdict(list)
+        eval_sets: Dict[
+            int,
+            List[
+                Union[
+                    _DatasetNames,
+                    Tuple[
+                        List[Optional[_DaskMatrixLike]],
+                        List[Optional[_DaskVectorLike]]
+                    ]
+                ]
+            ]
+        ] = defaultdict(list)
         if eval_sample_weight:
-            eval_sample_weights = defaultdict(list)
+            eval_sample_weights: Dict[
+                int,
+                List[
+                    Union[
+                        _DatasetNames,
+                        List[Optional[_DaskVectorLike]]
+                    ]
+                ]
+            ] = defaultdict(list)
         if eval_group:
-            eval_groups = defaultdict(list)
+            eval_groups: Dict[
+                int,
+                List[
+                    Union[
+                        _DatasetNames,
+                        List[Optional[_DaskVectorLike]]
+                    ]
+                ]
+            ] = defaultdict(list)
         if eval_init_score:
-            eval_init_scores = defaultdict(list)
+            eval_init_scores: Dict[
+                int,
+                List[
+                    Union[
+                        _DatasetNames,
+                        List[Optional[_DaskMatrixLike]]
+                    ]
+                ]
+            ] = defaultdict(list)
 
         for i, (X_eval, y_eval) in enumerate(eval_set):
             n_this_eval_parts = X_eval.npartitions
@@ -610,8 +645,8 @@ def _train(
                         eval_sets[parts_idx].append(([x_e], [y_e]))
                     else:
                         # append additional chunks of this eval set to this part.
-                        eval_sets[parts_idx][-1][0].append(x_e)
-                        eval_sets[parts_idx][-1][1].append(y_e)
+                        eval_sets[parts_idx][-1][0].append(x_e)  # type: ignore[index, union-attr]
+                        eval_sets[parts_idx][-1][1].append(y_e)  # type: ignore[index, union-attr]
 
             if eval_sample_weight:
                 if eval_sample_weight[i] is sample_weight:
@@ -631,7 +666,7 @@ def _train(
                         if j < n_parts:
                             eval_sample_weights[parts_idx].append([w_e])
                         else:
-                            eval_sample_weights[parts_idx][-1].append(w_e)
+                            eval_sample_weights[parts_idx][-1].append(w_e)  # type: ignore[union-attr]
 
             if eval_init_score:
                 if eval_init_score[i] is init_score:
@@ -649,7 +684,7 @@ def _train(
                         if j < n_parts:
                             eval_init_scores[parts_idx].append([init_score_e])
                         else:
-                            eval_init_scores[parts_idx][-1].append(init_score_e)
+                            eval_init_scores[parts_idx][-1].append(init_score_e)  # type: ignore[union-attr]
 
             if eval_group:
                 if eval_group[i] is group:
@@ -667,7 +702,7 @@ def _train(
                         if j < n_parts:
                             eval_groups[parts_idx].append([g_e])
                         else:
-                            eval_groups[parts_idx][-1].append(g_e)
+                            eval_groups[parts_idx][-1].append(g_e)  # type: ignore[union-attr]
 
         # assign sub-eval_set components to worker parts.
         for parts_idx, e_set in eval_sets.items():
@@ -686,7 +721,8 @@ def _train(
 
     for part in parts:
         if part.status == 'error':  # type: ignore
-            return part  # trigger error locally
+            # trigger error locally
+            return part  # type: ignore[return-value]
 
     # Find locations of all parts and map them to particular Dask workers
     key_to_part_dict = {part.key: part for part in parts}  # type: ignore
@@ -701,7 +737,7 @@ def _train(
         for worker in worker_map:
             has_eval_set = False
             for part in worker_map[worker]:
-                if 'eval_set' in part.result():
+                if 'eval_set' in part.result():  # type: ignore[attr-defined]
                     has_eval_set = True
                     break
 
@@ -751,7 +787,7 @@ def _train(
     else:
         if listen_port_in_params:
             _log_info("Using passed-in 'local_listen_port' for all workers")
-            unique_hosts = set(urlparse(a).hostname for a in worker_addresses)
+            unique_hosts = {urlparse(a).hostname for a in worker_addresses}
             if len(unique_hosts) < len(worker_addresses):
                 msg = (
                     "'local_listen_port' was provided in Dask training parameters, but at least one "
@@ -836,6 +872,7 @@ def _predict_part(
     **kwargs: Any
 ) -> _DaskPart:
 
+    result: _DaskPart
     if part.shape[0] == 0:
         result = np.array([])
     elif pred_proba:
@@ -1001,7 +1038,7 @@ def _extract(items: List[Any], i: int) -> Any:
             **kwargs,
         )
         pred_row = predict_fn(data_row)
-        chunks = (data.chunks[0],)
+        chunks: Tuple[int, ...] = (data.chunks[0],)
         map_blocks_kwargs = {}
         if len(pred_row.shape) > 1:
             chunks += (pred_row.shape[1],)
@@ -1133,10 +1170,19 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        early_stopping: bool = False,
+        validation_fraction: Optional[float] = 0.1,
+        n_iter_no_change: int = 10,
+        validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
         """Docstring is inherited from the lightgbm.LGBMClassifier.__init__."""
+        if early_stopping:
+            raise NotImplementedError(
+                "Early Stopping is not available for the Dask interface of lightgbm "
+                f"(found early_stopping={early_stopping})"
+            )
         self.client = client
         super().__init__(
             boosting_type=boosting_type,
@@ -1235,7 +1281,7 @@ def fit(  # type: ignore[override]
 
     def predict(
         self,
-        X: _DaskMatrixLike,
+        X: _DaskMatrixLike,  # type: ignore[override]
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -1270,7 +1316,7 @@ def predict(
 
     def predict_proba(
         self,
-        X: _DaskMatrixLike,
+        X: _DaskMatrixLike,  # type: ignore[override]
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -1338,10 +1384,19 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        early_stopping: bool = False,
+        validation_fraction: Optional[float] = 0.1,
+        n_iter_no_change: int = 10,
+        validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
         """Docstring is inherited from the lightgbm.LGBMRegressor.__init__."""
+        if early_stopping:
+            raise NotImplementedError(
+                "Early Stopping is not available for the Dask interface of lightgbm "
+                f"(found early_stopping={early_stopping})"
+            )
         self.client = client
         super().__init__(
             boosting_type=boosting_type,
@@ -1441,7 +1496,7 @@ def fit(  # type: ignore[override]
 
     def predict(
         self,
-        X: _DaskMatrixLike,
+        X: _DaskMatrixLike,  # type: ignore[override]
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -1508,10 +1563,19 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        early_stopping: bool = False,
+        validation_fraction: Optional[float] = 0.1,
+        n_iter_no_change: int = 10,
+        validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None,
         client: Optional[Client] = None,
         **kwargs: Any
     ):
         """Docstring is inherited from the lightgbm.LGBMRanker.__init__."""
+        if early_stopping:
+            raise NotImplementedError(
+                "Early Stopping is not available for the Dask interface of lightgbm "
+                f"(found early_stopping={early_stopping})"
+            )
         self.client = client
         super().__init__(
             boosting_type=boosting_type,
@@ -1616,7 +1680,7 @@ def fit(  # type: ignore[override]
 
     def predict(
         self,
-        X: _DaskMatrixLike,
+        X: _DaskMatrixLike,  # type: ignore[override]
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
diff --git a/python-package/lightgbm/engine.py b/python-package/lightgbm/engine.py
index 3a0c93fba332..1f8624b7055d 100644
--- a/python-package/lightgbm/engine.py
+++ b/python-package/lightgbm/engine.py
@@ -11,8 +11,9 @@
 
 from . import callback
 from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _InnerPredictor,
-                    _LGBM_CategoricalFeatureConfiguration, _LGBM_CustomObjectiveFunction,
-                    _LGBM_FeatureNameConfiguration, _log_warning)
+                    _LGBM_BoosterEvalMethodResultType, _LGBM_CategoricalFeatureConfiguration,
+                    _LGBM_CustomObjectiveFunction, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
+                    _log_warning)
 from .compat import SKLEARN_INSTALLED, _LGBMBaseCrossValidator, _LGBMGroupKFold, _LGBMStratifiedKFold
 
 __all__ = [
@@ -22,9 +23,15 @@
 ]
 
 
-_LGBM_CustomMetricFunction = Callable[
-    [np.ndarray, Dataset],
-    Union[Tuple[str, float, bool], List[Tuple[str, float, bool]]]
+_LGBM_CustomMetricFunction = Union[
+    Callable[
+        [np.ndarray, Dataset],
+        _LGBM_EvalFunctionResultType,
+    ],
+    Callable[
+        [np.ndarray, Dataset],
+        List[_LGBM_EvalFunctionResultType]
+    ],
 ]
 
 _LGBM_PreprocFunction = Callable[
@@ -134,6 +141,20 @@ def train(
     booster : Booster
         The trained Booster model.
     """
+    if not isinstance(train_set, Dataset):
+        raise TypeError(f"train() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")
+
+    if num_boost_round <= 0:
+        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")
+
+    if isinstance(valid_sets, list):
+        for i, valid_item in enumerate(valid_sets):
+            if not isinstance(valid_item, Dataset):
+                raise TypeError(
+                    "Every item in valid_sets must be a Dataset object. "
+                    f"Item {i} has type '{type(valid_item).__name__}'."
+                )
+
     # create predictor first
     params = copy.deepcopy(params)
     params = _choose_param_value(
@@ -160,17 +181,12 @@ def train(
         params.pop("early_stopping_round")
     first_metric_only = params.get('first_metric_only', False)
 
-    if num_boost_round <= 0:
-        raise ValueError("num_boost_round should be greater than zero.")
     predictor: Optional[_InnerPredictor] = None
     if isinstance(init_model, (str, Path)):
         predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
     elif isinstance(init_model, Booster):
-        predictor = init_model._to_predictor(dict(init_model.params, **params))
+        predictor = init_model._to_predictor(pred_parameter=dict(init_model.params, **params))
     init_iteration = predictor.num_total_iteration if predictor is not None else 0
-    # check dataset
-    if not isinstance(train_set, Dataset):
-        raise TypeError("Training only accepts Dataset object")
 
     train_set._update_params(params) \
              ._set_predictor(predictor) \
@@ -193,8 +209,6 @@ def train(
                 if valid_names is not None:
                     train_data_name = valid_names[i]
                 continue
-            if not isinstance(valid_data, Dataset):
-                raise TypeError("Training only accepts Dataset object")
             reduced_valid_sets.append(valid_data._update_params(params).set_reference(train_set))
             if valid_names is not None and len(valid_names) > i:
                 name_valid_sets.append(valid_names[i])
@@ -211,7 +225,7 @@ def train(
     if "early_stopping_round" in params:
         callbacks_set.add(
             callback.early_stopping(
-                stopping_rounds=params["early_stopping_round"],
+                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                 first_metric_only=first_metric_only,
                 verbose=_choose_param_value(
                     main_param_name="verbosity",
@@ -251,7 +265,7 @@ def train(
 
         booster.update(fobj=fobj)
 
-        evaluation_result_list = []
+        evaluation_result_list: List[_LGBM_BoosterEvalMethodResultType] = []
         # check evaluation result.
         if valid_sets is not None:
             if is_valid_contain_train:
@@ -531,7 +545,7 @@ def cv(
     callbacks: Optional[List[Callable]] = None,
     eval_train_metric: bool = False,
     return_cvbooster: bool = False
-) -> Dict[str, Any]:
+) -> Dict[str, Union[List[float], CVBooster]]:
     """Perform the cross-validation with given parameters.
 
     Parameters
@@ -637,10 +651,14 @@ def cv(
         {'metric1-mean': [values], 'metric1-stdv': [values],
         'metric2-mean': [values], 'metric2-stdv': [values],
         ...}.
-        If ``return_cvbooster=True``, also returns trained boosters via ``cvbooster`` key.
+        If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.
     """
     if not isinstance(train_set, Dataset):
-        raise TypeError("Training only accepts Dataset object")
+        raise TypeError(f"cv() only accepts Dataset object, train_set has type '{type(train_set).__name__}'.")
+
+    if num_boost_round <= 0:
+        raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")
+
     params = copy.deepcopy(params)
     params = _choose_param_value(
         main_param_name='objective',
@@ -666,12 +684,10 @@ def cv(
         params.pop("early_stopping_round")
     first_metric_only = params.get('first_metric_only', False)
 
-    if num_boost_round <= 0:
-        raise ValueError("num_boost_round should be greater than zero.")
     if isinstance(init_model, (str, Path)):
         predictor = _InnerPredictor(model_file=init_model, pred_parameter=params)
     elif isinstance(init_model, Booster):
-        predictor = init_model._to_predictor(dict(init_model.params, **params))
+        predictor = init_model._to_predictor(pred_parameter=dict(init_model.params, **params))
     else:
         predictor = None
 
@@ -702,7 +718,7 @@ def cv(
     if "early_stopping_round" in params:
         callbacks_set.add(
             callback.early_stopping(
-                stopping_rounds=params["early_stopping_round"],
+                stopping_rounds=params["early_stopping_round"],  # type: ignore[arg-type]
                 first_metric_only=first_metric_only,
                 verbose=_choose_param_value(
                     main_param_name="verbosity",
@@ -725,8 +741,8 @@ def cv(
                                     begin_iteration=0,
                                     end_iteration=num_boost_round,
                                     evaluation_result_list=None))
-        cvfolds.update(fobj=fobj)
-        res = _agg_cv_result(cvfolds.eval_valid(feval))
+        cvfolds.update(fobj=fobj)  # type: ignore[call-arg]
+        res = _agg_cv_result(cvfolds.eval_valid(feval))  # type: ignore[call-arg]
         for _, key, mean, _, std in res:
             results[f'{key}-mean'].append(mean)
             results[f'{key}-stdv'].append(std)
@@ -747,6 +763,6 @@ def cv(
             break
 
     if return_cvbooster:
-        results['cvbooster'] = cvfolds
+        results['cvbooster'] = cvfolds  # type: ignore[assignment]
 
     return dict(results)
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
index bf3190320dc3..1bd991c0f618 100644
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -6,15 +6,16 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+import scipy.sparse
 
 from .basic import (Booster, Dataset, LightGBMError, _choose_param_value, _ConfigAliases, _LGBM_BoosterBestScoreType,
                     _LGBM_CategoricalFeatureConfiguration, _LGBM_EvalFunctionResultType, _LGBM_FeatureNameConfiguration,
-                    _log_warning)
-from .callback import _EvalResultDict, record_evaluation
+                    _LGBM_GroupType, _LGBM_InitScoreType, _LGBM_LabelType, _LGBM_WeightType, _log_warning)
+from .callback import _EarlyStoppingCallback, _EvalResultDict, early_stopping, record_evaluation
 from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray,
                      _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase,
-                     _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase,
-                     dt_DataTable, pd_DataFrame)
+                     _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMGroupKFold, _LGBMLabelEncoder, _LGBMModelBase,
+                     _LGBMRegressorBase, _LGBMTrainTestSplit, dt_DataTable, pd_DataFrame)
 from .engine import train
 
 __all__ = [
@@ -24,39 +25,79 @@
     'LGBMRegressor',
 ]
 
+_LGBM_ScikitMatrixLike = Union[  # feature-matrix containers accepted for ``X`` by ``fit`` and the validation-set split helpers
+    dt_DataTable,
+    List[Union[List[float], List[int]]],
+    np.ndarray,
+    pd_DataFrame,
+    scipy.sparse.spmatrix
+]
 _LGBM_ScikitCustomObjectiveFunction = Union[
+    # f(labels, preds)
     Callable[
-        [np.ndarray, np.ndarray],
+        [Optional[np.ndarray], np.ndarray],
         Tuple[np.ndarray, np.ndarray]
     ],
+    # f(labels, preds, weights)
     Callable[
-        [np.ndarray, np.ndarray, np.ndarray],
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
         Tuple[np.ndarray, np.ndarray]
     ],
+    # f(labels, preds, weights, group)
     Callable[
-        [np.ndarray, np.ndarray, np.ndarray, np.ndarray],
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
         Tuple[np.ndarray, np.ndarray]
     ],
 ]
 _LGBM_ScikitCustomEvalFunction = Union[
+    # f(labels, preds)
+    Callable[
+        [Optional[np.ndarray], np.ndarray],
+        _LGBM_EvalFunctionResultType
+    ],
     Callable[
-        [np.ndarray, np.ndarray],
-        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
+        [Optional[np.ndarray], np.ndarray],
+        List[_LGBM_EvalFunctionResultType]
     ],
+    # f(labels, preds, weights)
     Callable[
-        [np.ndarray, np.ndarray, np.ndarray],
-        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
+        _LGBM_EvalFunctionResultType
     ],
     Callable[
-        [np.ndarray, np.ndarray, np.ndarray, np.ndarray],
-        Union[_LGBM_EvalFunctionResultType, List[_LGBM_EvalFunctionResultType]]
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
+        List[_LGBM_EvalFunctionResultType]
     ],
+    # f(labels, preds, weights, group)
+    Callable[
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
+        _LGBM_EvalFunctionResultType
+    ],
+    Callable[
+        [Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
+        List[_LGBM_EvalFunctionResultType]
+    ]
 ]
 _LGBM_ScikitEvalMetricType = Union[
     str,
     _LGBM_ScikitCustomEvalFunction,
     List[Union[str, _LGBM_ScikitCustomEvalFunction]]
 ]
+_LGBM_ScikitCustomEvalSetSplitter = Union[  # accepted signatures for a user-supplied validation-set splitter
+    Callable[  # f(X, y) -> X_train, X_val, y_train, y_val
+        [_LGBM_ScikitMatrixLike, _LGBM_LabelType],
+        Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType]
+    ],
+    Callable[  # f(X, y, weight) -> X_train, X_val, y_train, y_val, weight_train, weight_val
+        [_LGBM_ScikitMatrixLike, _LGBM_LabelType, Optional[np.ndarray]],
+        Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType, Optional[np.ndarray], Optional[np.ndarray]]
+    ],
+    Callable[  # f(X, y, weight, group) -> the six values above plus group_train, group_val
+        [_LGBM_ScikitMatrixLike, _LGBM_LabelType, Optional[np.ndarray], _LGBM_GroupType],
+        Tuple[_LGBM_ScikitMatrixLike, _LGBM_ScikitMatrixLike, _LGBM_LabelType, _LGBM_LabelType, Optional[np.ndarray], Optional[np.ndarray], _LGBM_GroupType, _LGBM_GroupType]
+    ],
+]
+_LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType]  # one (X, y) pair; ``fit`` takes a list of these as ``eval_set``
 
 
 class _ObjectiveFunctionWrapper:
@@ -127,11 +168,11 @@ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.
         labels = dataset.get_label()
         argc = len(signature(self.func).parameters)
         if argc == 2:
-            grad, hess = self.func(labels, preds)
+            grad, hess = self.func(labels, preds)  # type: ignore[call-arg]
         elif argc == 3:
-            grad, hess = self.func(labels, preds, dataset.get_weight())
+            grad, hess = self.func(labels, preds, dataset.get_weight())  # type: ignore[call-arg]
         elif argc == 4:
-            grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group())
+            grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group())  # type: ignore [call-arg]
         else:
             raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}")
         return grad, hess
@@ -205,15 +246,280 @@ def __call__(
         labels = dataset.get_label()
         argc = len(signature(self.func).parameters)
         if argc == 2:
-            return self.func(labels, preds)
+            return self.func(labels, preds)  # type: ignore[call-arg]
         elif argc == 3:
-            return self.func(labels, preds, dataset.get_weight())
+            return self.func(labels, preds, dataset.get_weight())  # type: ignore[call-arg]
         elif argc == 4:
-            return self.func(labels, preds, dataset.get_weight(), dataset.get_group())
+            return self.func(labels, preds, dataset.get_weight(), dataset.get_group())  # type: ignore[call-arg]
         else:
             raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}")
 
 
+def _train_test_split(
+    X: _LGBM_ScikitMatrixLike,
+    y: _LGBM_LabelType,
+    weight,
+    test_size: float,
+    random_state: Optional[Union[int, np.random.RandomState]],
+    stratified: bool,
+) -> Tuple[
+    _LGBM_ScikitMatrixLike,
+    _LGBM_ScikitMatrixLike,
+    _LGBM_LabelType,
+    _LGBM_LabelType,
+    Optional[np.ndarray],
+    Optional[np.ndarray],
+]:
+    """Split X, y and (optionally) weights into random train and validation subsets.
+
+    Parameters
+    ----------
+    X : numpy 2-D array of shape = [n_samples, n_features]
+        The features matrix.
+    y : numpy 1-D array of shape = [n_samples]
+        The target values.
+    weight : numpy 1-D array of shape = [n_samples] or None
+        The weight of samples. Weights should be non-negative. If None, only X and y are split.
+    test_size : float
+        Should be between 0.0 and 1.0 and represent the proportion of the dataset
+        to include in the validation split.
+    random_state : int, RandomState instance or None
+        Controls the shuffling applied to the data before applying the split.
+        Pass an int for reproducible output across multiple function calls.
+    stratified : bool
+        If True, split data in a stratified fashion, using ``y`` as the stratification labels.
+
+    Returns
+    -------
+    Tuple[ np.ndarray, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray] ]
+        X_train: numpy 2-D array of shape = [n_train_samples, n_features]
+            The features matrix to be used for training.
+        X_val: numpy 2-D array of shape = [n_val_samples, n_features]
+            The features matrix to be used as an evaluation set for early stopping.
+        y_train: numpy 1-D array of shape = [n_train_samples]
+            The target values to be used for training.
+        y_val: numpy 1-D array of shape = [n_val_samples]
+            The target values to be used as an evaluation set for early stopping.
+        weight_train: numpy 1-D array of shape = [n_train_samples], optional
+            The weight of samples to be used for training. None if the input ``weight`` is None.
+            Weights should be non-negative.
+        weight_val: numpy 1-D array of shape = [n_val_samples], optional
+            The weight of samples to be used as an evaluation set for early stopping. None if the input ``weight`` is None.
+            Weights should be non-negative.
+    """
+    stratify = y if stratified else None  # stratify on the labels themselves when requested
+    if weight is not None:
+        return _LGBMTrainTestSplit(
+            X, y, weight, test_size=test_size, random_state=random_state, stratify=stratify
+        )
+    else:
+        X_train, X_val, y_train, y_val = _LGBMTrainTestSplit(
+            X, y, test_size=test_size, random_state=random_state, stratify=stratify
+        )
+        return X_train, X_val, y_train, y_val, None, None  # pad the weight slots so callers always unpack six values
+
+
+def _train_test_group_split(
+    X: _LGBM_ScikitMatrixLike,
+    y: _LGBM_LabelType,
+    weight,
+    group: _LGBM_GroupType,
+    n_splits: int
+) -> Tuple[
+    _LGBM_ScikitMatrixLike,
+    _LGBM_ScikitMatrixLike,
+    _LGBM_LabelType,
+    _LGBM_LabelType,
+    Optional[np.ndarray],
+    Optional[np.ndarray],
+    _LGBM_GroupType,
+    _LGBM_GroupType,
+]:
+    """Split X, y, weights and group into train and validation subsets using a group k-fold split.
+
+    Parameters
+    ----------
+    X : numpy 2-D array of shape = [n_samples, n_features]
+        The features matrix.
+    y : numpy 1-D array of shape = [n_samples]
+        The target values.
+    weight : numpy 1-D array of shape = [n_samples]
+        The weight of samples. Weights should be non-negative.
+    group : numpy 1-D array
+        Group/query data.
+        Only used in the learning-to-rank task.
+        sum(group) = n_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    n_splits : int
+        Number of folds passed to the group k-fold splitter. Only the first fold is used as the validation set,
+        so it holds approximately n_samples / n_splits rows.
+
+    Returns
+    -------
+    Tuple[ np.ndarray, np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray], Optional[np.ndarray], np.ndarray, np.ndarray ]
+        X_train: numpy 2-D array of shape = [n_train_samples, n_features]
+            The features matrix to be used for training.
+        X_val: numpy 2-D array of shape = [n_val_samples, n_features]
+            The features matrix to be used as an evaluation set for early stopping.
+        y_train: numpy 1-D array of shape = [n_train_samples]
+            The target values to be used for training.
+        y_val: numpy 1-D array of shape = [n_val_samples]
+            The target values to be used as an evaluation set for early stopping.
+        weight_train: numpy 1-D array of shape = [n_train_samples], optional
+            The weight of samples to be used for training. Returned if input weights is not None. Weights should be non-negative.
+        weight_val: numpy 1-D array of shape = [n_val_samples], optional
+            The weight of samples to be used as an evaluation set for early stopping. Returned if input weights is not None. Weights should be non-negative.
+        group_train: numpy 1-D array
+            Group/query data to be used for training.
+            Only used in the learning-to-rank task.
+            sum(group) = n_train_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+        group_val: numpy 1-D array
+            Group/query data to be used as an evaluation set for early stopping.
+            Only used in the learning-to-rank task.
+            sum(group) = n_val_samples.
+            For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+            where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    """
+    group_k_fold = _LGBMGroupKFold(n_splits)
+    # np.asarray copies only when needed; ``np.array(..., copy=False)`` raises on NumPy >= 2.0 if a copy is required
+    group = np.asarray(group, dtype=np.int32)
+    # expand group sizes into one group id per row, e.g. [2, 3] -> [0, 0, 1, 1, 1]
+    group_flattened = np.repeat(np.arange(len(group)), group)
+    train_idx, val_idx = next(group_k_fold.split(X, y, groups=group_flattened))  # only the first fold is used
+    full_ds = Dataset(data=X, label=y, weight=weight, group=group, free_raw_data=False)
+    train_ds = full_ds.subset(sorted(train_idx)).construct()
+    val_ds = full_ds.subset(sorted(val_idx)).construct()
+    return (
+        train_ds.get_data(),
+        val_ds.get_data(),
+        train_ds.get_label(),
+        val_ds.get_label(),
+        train_ds.get_weight(),
+        val_ds.get_weight(),
+        train_ds.get_group(),
+        val_ds.get_group(),
+    )
+
+
+def _train_test_split_custom_splitter(
+    custom_splitter: _LGBM_ScikitCustomEvalSetSplitter,
+    X: _LGBM_ScikitMatrixLike,
+    y: _LGBM_LabelType,
+    weight,
+    group: Optional[_LGBM_GroupType]
+) -> Tuple[
+    _LGBM_ScikitMatrixLike,
+    _LGBM_ScikitMatrixLike,
+    _LGBM_LabelType,
+    _LGBM_LabelType,
+    Optional[np.ndarray],
+    Optional[np.ndarray],
+    Optional[_LGBM_GroupType],
+    Optional[_LGBM_GroupType],
+]:
+    """Call the passed custom_splitter with as many arguments as its signature declares.
+
+    Parameters
+    ----------
+    custom_splitter : callable
+        Expects a callable with one of the following signatures:
+        ``func(X, y) -> X_train, X_val, y_train, y_val``,
+        ``func(X, y, weight) -> X_train, X_val, y_train, y_val, weight_train, weight_val`` or
+        ``func(X, y, weight, group) ->
+            X_train, X_val, y_train, y_val, weight_train, weight_val, group_train, group_val``
+        where
+            X : numpy 2-D array of shape = [n_samples, n_features]
+                The features matrix.
+            y : numpy 1-D array of shape = [n_samples]
+                The target values.
+            weight : numpy 1-D array of shape = [n_samples]
+                The weight of samples. Weights should be non-negative.
+            group : numpy 1-D array
+                Group/query data. Only used in the learning-to-rank task.
+                sum(group) = n_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+            X_train: numpy 2-D array of shape = [n_train_samples, n_features]
+                The features matrix to be used for training.
+            X_val: numpy 2-D array of shape = [n_val_samples, n_features]
+                The features matrix to be used as an evaluation set for early stopping.
+            y_train: numpy 1-D array of shape = [n_train_samples]
+                The target values to be used for training.
+            y_val: numpy 1-D array of shape = [n_val_samples]
+                The target values to be used as an evaluation set for early stopping.
+            weight_train: numpy 1-D array of shape = [n_train_samples]
+                The weight of samples to be used for training. Weights should be non-negative.
+            weight_val: numpy 1-D array of shape = [n_val_samples]
+                The weight of samples to be used as an evaluation set for early stopping. Weights should be non-negative.
+            group_train: numpy 1-D array
+                Group/query data to be used for training. Only used in the learning-to-rank task.
+                sum(group) = n_train_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+            group_val: numpy 1-D array
+                Group/query data to be used as an evaluation set for early stopping. Only used in the learning-to-rank task.
+                sum(group) = n_val_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    X : numpy 2-D array of shape = [n_samples, n_features]
+        The features matrix.
+    y : numpy 1-D array of shape = [n_samples]
+        The target values.
+    weight : numpy 1-D array of shape = [n_samples]
+        The weight of samples. Weights should be non-negative. Only passed on if ``custom_splitter`` takes 3 or 4 arguments.
+    group : numpy 1-D array or None
+        Group/query data. Only used in the learning-to-rank task. Only passed on if ``custom_splitter`` takes 4 arguments.
+        sum(group) = n_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+
+    Returns
+    -------
+    X_train: numpy 2-D array of shape = [n_train_samples, n_features]
+        The features matrix to be used for training.
+    X_val: numpy 2-D array of shape = [n_val_samples, n_features]
+        The features matrix to be used as an evaluation set for early stopping.
+    y_train: numpy 1-D array of shape = [n_train_samples]
+        The target values to be used for training.
+    y_val: numpy 1-D array of shape = [n_val_samples]
+        The target values to be used as an evaluation set for early stopping.
+    weight_train: numpy 1-D array of shape = [n_train_samples]
+        The weight of samples to be used for training. Weights should be non-negative. None if ``custom_splitter`` takes only 2 arguments.
+    weight_val: numpy 1-D array of shape = [n_val_samples]
+        The weight of samples to be used as an evaluation set for early stopping. Weights should be non-negative. None if ``custom_splitter`` takes only 2 arguments.
+    group_train: numpy 1-D array
+        Group/query data to be used for training. Only used in the learning-to-rank task. None if ``custom_splitter`` takes fewer than 4 arguments.
+        sum(group) = n_train_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+    group_val: numpy 1-D array
+        Group/query data to be used as an evaluation set for early stopping. Only used in the learning-to-rank task. None if ``custom_splitter`` takes fewer than 4 arguments.
+        sum(group) = n_val_samples.
+        For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+        where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+
+    Raises
+    ------
+    TypeError
+        If ``custom_splitter`` does not declare 2, 3 or 4 parameters.
+    """
+    # dispatch on the number of parameters the splitter declares, padding the outputs it cannot produce with None
+    argc = len(signature(custom_splitter).parameters)
+    if argc == 2:
+        X_train, X_val, y_train, y_val = custom_splitter(X, y)
+        return X_train, X_val, y_train, y_val, None, None, None, None
+    elif argc == 3:
+        X_train, X_val, y_train, y_val, weight_train, weight_val = custom_splitter(X, y, weight)
+        return X_train, X_val, y_train, y_val, weight_train, weight_val, None, None
+    elif argc == 4:
+        return custom_splitter(X, y, weight, group)
+    else:
+        raise TypeError(f"Self-defined validation set splitter should have 2, 3 or 4 arguments, got {argc}")
+
+
 # documentation templates for LGBMModel methods are shared between the classes in
 # this module and those in the ``dask`` module
 
@@ -385,6 +691,10 @@ def __init__(
         random_state: Optional[Union[int, np.random.RandomState]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = 'split',
+        early_stopping: bool = False,
+        validation_fraction: Optional[float] = 0.1,
+        n_iter_no_change: int = 10,
+        validation_set_split_strategy: Optional[Union[str, _LGBM_ScikitCustomEvalSetSplitter]] = None,
         **kwargs
     ):
         r"""Construct a gradient boosting model.
@@ -461,6 +771,25 @@ def __init__(
             The type of feature importance to be filled into ``feature_importances_``.
             If 'split', result contains numbers of times the feature is used in a model.
             If 'gain', result contains total gains of splits which use the feature.
+        early_stopping : bool, optional (default=False)
+            If ``True``, enables early stopping. If ``False`` and no ``early_stopping`` callbacks are passed
+            to the ``fit`` method, then early stopping is disabled.
+        validation_fraction : float or None, optional (default=0.1)
+            Proportion of training data to set aside as
+            validation data for early stopping. If None, early stopping is done on
+            the training data. Only used if early stopping is performed.
+        n_iter_no_change : int, optional (default=10)
+            Used to determine when to "early stop". The fitting process is
+            stopped when none of the last ``n_iter_no_change`` scores are better
+            than the ``n_iter_no_change - 1`` -th-to-last one, up to some
+            tolerance. Only used if early stopping is performed.
+        validation_set_split_strategy : Union[str, _LGBM_ScikitCustomEvalSetSplitter] (default=None)
+            Strategy to use to split validation data for early stopping.
+            If 'random' a random set of train data is used for validation.
+            If 'stratify', the random set of data is taken using stratified sampling.
+            If 'group', the split is done using a random sample of groups. Only used in the learning-to-rank task.
+            Default: 'random' for LGBMRegressor, 'stratify' for LGBMClassifier, 'group' for LGBMRanker.
+            Alternatively, a custom splitting function can be provided, for more details, see note below.
         **kwargs
             Other parameters for the model.
             Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
@@ -500,6 +829,51 @@ def __init__(
 
         For multi-class task, y_pred is a numpy 2-D array of shape = [n_samples, n_classes],
         and grad and hess should be returned in the same format.
+
+        A custom validation data splitting function can be provided for the ``validation_set_split_strategy`` parameter.
+        In this case, it should have the signature
+        ``func(X, y) -> X_train, X_val, y_train, y_val``,
+        ``func(X, y, weight) -> X_train, X_val, y_train, y_val, weight_train, weight_val`` or
+        ``func(X, y, weight, group) -> X_train, X_val, y_train, y_val, weight_train, weight_val, group_train, group_val``
+        where:
+
+            X : numpy 2-D array of shape = [n_samples, n_features]
+                The features matrix.
+            y : numpy 1-D array of shape = [n_samples]
+                The target values.
+            weight : numpy 1-D array of shape = [n_samples]
+                The weight of samples. Weights should be non-negative.
+            group : numpy 1-D array
+                Group/query data.
+                Only used in the learning-to-rank task.
+                sum(group) = n_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+            X_train: numpy 2-D array of shape = [n_train_samples, n_features]
+                The features matrix to be used for training.
+            X_val: numpy 2-D array of shape = [n_val_samples, n_features]
+                The features matrix to be used as an evaluation set for early stopping.
+            y_train: numpy 1-D array of shape = [n_train_samples]
+                The target values to be used for training.
+            y_val: numpy 1-D array of shape = [n_val_samples]
+                The target values to be used as an evaluation set for early stopping.
+            weight_train: numpy 1-D array of shape = [n_train_samples]
+                The weight of samples to be used for training. Weights should be non-negative.
+            weight_val: numpy 1-D array of shape = [n_val_samples]
+                The weight of samples to be used as an evaluation set for early stopping. Weights should be non-negative.
+            group_train: numpy 1-D array
+                Group/query data to be used for training.
+                Only used in the learning-to-rank task.
+                sum(group) = n_train_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+            group_val: numpy 1-D array
+                Group/query data to be used as an evaluation set for early stopping.
+                Only used in the learning-to-rank task.
+                sum(group) = n_val_samples.
+                For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups,
+                where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc.
+
         """
         if not SKLEARN_INSTALLED:
             raise LightGBMError('scikit-learn is required for lightgbm.sklearn. '
@@ -536,6 +910,10 @@ def __init__(
         self._n_features_in: int = -1
         self._classes: Optional[np.ndarray] = None
         self._n_classes: int = -1
+        self.early_stopping = early_stopping
+        self.validation_fraction = validation_fraction
+        self.n_iter_no_change = n_iter_no_change
+        self.validation_set_split_strategy = validation_set_split_strategy
         self.set_params(**kwargs)
 
     def _more_tags(self) -> Dict[str, Any]:
@@ -638,6 +1016,10 @@ def _process_params(self, stage: str) -> Dict[str, Any]:
         params.pop('importance_type', None)
         params.pop('n_estimators', None)
         params.pop('class_weight', None)
+        params.pop("early_stopping", None)
+        params.pop("validation_fraction", None)
+        params.pop("n_iter_no_change", None)
+        params.pop("validation_set_split_strategy", None)
 
         if isinstance(params['random_state'], np.random.RandomState):
             params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max)
@@ -697,17 +1079,17 @@ def _process_n_jobs(self, n_jobs: Optional[int]) -> int:
 
     def fit(
         self,
-        X,
-        y,
-        sample_weight=None,
-        init_score=None,
-        group=None,
-        eval_set=None,
+        X: _LGBM_ScikitMatrixLike,
+        y: _LGBM_LabelType,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        group: Optional[_LGBM_GroupType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_class_weight=None,
-        eval_init_score=None,
-        eval_group=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_class_weight: Optional[List[float]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+        eval_group: Optional[List[_LGBM_GroupType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -719,9 +1101,13 @@ def fit(
 
         # Do not modify original args in fit function
         # Refer to https://github.com/microsoft/LightGBM/pull/2619
-        eval_metric_list = copy.deepcopy(eval_metric)
-        if not isinstance(eval_metric_list, list):
-            eval_metric_list = [eval_metric_list]
+        eval_metric_list: List[Union[str, _LGBM_ScikitCustomEvalFunction]]
+        if eval_metric is None:
+            eval_metric_list = []
+        elif isinstance(eval_metric, list):
+            eval_metric_list = copy.deepcopy(eval_metric)
+        else:
+            eval_metric_list = [copy.deepcopy(eval_metric)]
 
         # Separate built-in from callable evaluation metrics
         eval_metrics_callable = [_EvalFunctionWrapper(f) for f in eval_metric_list if callable(f)]
@@ -732,10 +1118,120 @@ def fit(
         params['metric'] = [e for e in eval_metrics_builtin if e not in params['metric']] + params['metric']
         params['metric'] = [metric for metric in params['metric'] if metric is not None]
 
+        if self.early_stopping and eval_set is None:
+
+            if self.validation_set_split_strategy is None:
+                if isinstance(self, LGBMRegressor):
+                    _validation_set_split_strategy = "random"
+                elif isinstance(self, LGBMClassifier):
+                    _validation_set_split_strategy = "stratify"
+                elif isinstance(self, LGBMRanker):
+                    _validation_set_split_strategy = "group"
+                else:
+                    raise ValueError("Unknown LGBMModel type.")
+            else:
+                _validation_set_split_strategy = self.validation_set_split_strategy
+
+            if callable(_validation_set_split_strategy):
+                (
+                    _X_train, _X_val,
+                    _y_train, _y_val,
+                    sample_weight_train, sample_weight_val,
+                    group_train, group_val
+                ) = _train_test_split_custom_splitter(
+                    custom_splitter=_validation_set_split_strategy,
+                    X=X,
+                    y=y,
+                    weight=sample_weight,
+                    group=group
+                )
+            elif self.validation_fraction is None:
+                # If validation_fraction is None early stopping is done on the training data
+                _X_train = X
+                _X_val = copy.copy(X)
+                _y_train = y
+                _y_val = copy.copy(y)
+                sample_weight_train = sample_weight
+                sample_weight_val = copy.copy(sample_weight)
+                group_train = group
+                group_val = copy.copy(group)
+            elif _validation_set_split_strategy != "group" and isinstance(self, LGBMRanker):
+                # The built-in "random" / "stratify" splits drop the group information
+                # (group_train / group_val are set to None), so a ranker must use "group"
+                # or a custom callable. Adjacent literals need explicit separating spaces.
+                raise ValueError(
+                    "Parameter group has been specified but the selected "
+                    f"validation_set_split_strategy ({_validation_set_split_strategy}) "
+                    "does not support groups, please set validation_set_split_strategy to \"group\" or "
+                    "provide a callable with the signature func(X, y, weights, group) -> "
+                    "(X_train, X_val, y_train, y_val, weights_train, weights_val, group_train, group_val)"
+                )
+            elif _validation_set_split_strategy == "random":
+                (
+                    _X_train, _X_val,
+                    _y_train, _y_val,
+                    sample_weight_train, sample_weight_val,
+                ) = _train_test_split(
+                    X,
+                    y,
+                    sample_weight,
+                    test_size=self.validation_fraction,
+                    random_state=self.random_state,
+                    stratified=False,
+                )
+                group_train, group_val = None, None
+            elif _validation_set_split_strategy == "stratify":
+                (
+                    _X_train, _X_val,
+                    _y_train, _y_val,
+                    sample_weight_train, sample_weight_val,
+                ) = _train_test_split(
+                    X,
+                    y,
+                    sample_weight,
+                    test_size=self.validation_fraction,
+                    random_state=self.random_state,
+                    stratified=True,
+                )
+                group_train, group_val = None, None
+            elif _validation_set_split_strategy == "group":
+                n_splits = max(int(np.ceil(1 / self.validation_fraction)), 2)
+                (
+                    _X_train, _X_val,
+                    _y_train, _y_val,
+                    sample_weight_train, sample_weight_val,
+                    group_train, group_val
+                ) = _train_test_group_split(
+                    X,
+                    y,
+                    sample_weight,
+                    group,
+                    n_splits=n_splits
+                )
+            else:
+                raise ValueError(
+                    "validation_set_split_strategy must be a callable or one of the following "
+                    "values { \"random\", \"stratify\", \"group\"}, got "
+                    f"{_validation_set_split_strategy}")
+
+            eval_set = [(_X_val, _y_val)]
+            if sample_weight_val is not None:
+                eval_sample_weight = [sample_weight_val]
+            if group_val is not None:
+                eval_group = [group_val]
+
+            _X, _y = _X_train, _y_train
+            _sample_weight = sample_weight_train
+            _group = group_train
+        else:
+            _X, _y = X, y
+            _sample_weight = sample_weight
+            _group = group
+
         if not isinstance(X, (pd_DataFrame, dt_DataTable)):
-            _X, _y = _LGBMCheckXY(X, y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
-            if sample_weight is not None:
-                sample_weight = _LGBMCheckSampleWeight(sample_weight, _X)
-        else:
-            _X, _y = X, y
+            _X, _y = _LGBMCheckXY(_X, _y, accept_sparse=True, force_all_finite=False, ensure_min_samples=2)
+            if _sample_weight is not None:
+                _sample_weight = _LGBMCheckSampleWeight(_sample_weight, _X)
+        # No `else` branch: `_X` / `_y` were assigned above and may hold the early-stopping
+        # training split; resetting them to the full `X` / `y` would misalign weights and groups.
 
@@ -743,16 +1239,16 @@ def fit(
             self._class_weight = self.class_weight
         if self._class_weight is not None:
-            class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, y)
-            if sample_weight is None or len(sample_weight) == 0:
-                sample_weight = class_sample_weight
+            class_sample_weight = _LGBMComputeSampleWeight(self._class_weight, _y)  # labels actually trained on
+            if _sample_weight is None or len(_sample_weight) == 0:
+                _sample_weight = class_sample_weight
             else:
-                sample_weight = np.multiply(sample_weight, class_sample_weight)
+                _sample_weight = np.multiply(_sample_weight, class_sample_weight)
 
         self._n_features = _X.shape[1]
         # copy for consistency
         self._n_features_in = self._n_features
 
-        train_set = Dataset(data=_X, label=_y, weight=sample_weight, group=group,
+        train_set = Dataset(data=_X, label=_y, weight=_sample_weight, group=_group,
                             init_score=init_score, categorical_feature=categorical_feature,
                             params=params)
 
@@ -805,13 +1301,16 @@ def _get_meta_data(collection, name, i):
         evals_result: _EvalResultDict = {}
         callbacks.append(record_evaluation(evals_result))
 
+        if (self.early_stopping and all(type(callback) is not _EarlyStoppingCallback for callback in callbacks)):
+            callbacks.append(early_stopping(self.n_iter_no_change))
+
         self._Booster = train(
             params=params,
             train_set=train_set,
             num_boost_round=self.n_estimators,
             valid_sets=valid_sets,
             valid_names=eval_names,
-            feval=eval_metrics_callable,
+            feval=eval_metrics_callable,  # type: ignore[arg-type]
             init_model=init_model,
             feature_name=feature_name,
             callbacks=callbacks
@@ -829,19 +1328,19 @@ def _get_meta_data(collection, name, i):
         return self
 
     fit.__doc__ = _lgbmmodel_doc_fit.format(
-        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
-        y_shape="array-like of shape = [n_samples]",
-        sample_weight_shape="array-like of shape = [n_samples] or None, optional (default=None)",
-        init_score_shape="array-like of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
-        group_shape="array-like or None, optional (default=None)",
-        eval_sample_weight_shape="list of array, or None, optional (default=None)",
-        eval_init_score_shape="list of array, or None, optional (default=None)",
-        eval_group_shape="list of array, or None, optional (default=None)"
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
+        y_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples]",
+        sample_weight_shape="numpy array, pandas Series, list of int or float of shape = [n_samples] or None, optional (default=None)",
+        init_score_shape="numpy array, pandas DataFrame, pandas Series, list of int or float of shape = [n_samples] or shape = [n_samples * n_classes] (for multi-class task) or shape = [n_samples, n_classes] (for multi-class task) or None, optional (default=None)",
+        group_shape="numpy array, pandas Series, list of int or float, or None, optional (default=None)",
+        eval_sample_weight_shape="list of array (same types as ``sample_weight`` supports), or None, optional (default=None)",
+        eval_init_score_shape="list of array (same types as ``init_score`` supports), or None, optional (default=None)",
+        eval_group_shape="list of array (same types as ``group`` supports), or None, optional (default=None)"
     ) + "\n\n" + _lgbmmodel_doc_custom_eval_note
 
     def predict(
         self,
-        X,
+        X: _LGBM_ScikitMatrixLike,
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -889,7 +1388,7 @@ def predict(
 
     predict.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted value for each sample.",
-        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
         output_name="predicted_result",
         predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
         X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
@@ -929,7 +1428,7 @@ def objective_(self) -> Union[str, _LGBM_ScikitCustomObjectiveFunction]:
         """:obj:`str` or :obj:`callable`: The concrete objective used while fitting this model."""
         if not self.__sklearn_is_fitted__():
             raise LGBMNotFittedError('No objective found. Need to call fit beforehand.')
-        return self._objective
+        return self._objective  # type: ignore[return-value]
 
     @property
     def n_estimators_(self) -> int:
@@ -993,14 +1492,14 @@ class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
 
     def fit(  # type: ignore[override]
         self,
-        X,
-        y,
-        sample_weight=None,
-        init_score=None,
-        eval_set=None,
+        X: _LGBM_ScikitMatrixLike,
+        y: _LGBM_LabelType,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_init_score=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1039,15 +1538,15 @@ class LGBMClassifier(_LGBMClassifierBase, LGBMModel):
 
     def fit(  # type: ignore[override]
         self,
-        X,
-        y,
-        sample_weight=None,
-        init_score=None,
-        eval_set=None,
+        X: _LGBM_ScikitMatrixLike,
+        y: _LGBM_LabelType,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_class_weight=None,
-        eval_init_score=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_class_weight: Optional[List[float]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
         categorical_feature: _LGBM_CategoricalFeatureConfiguration = 'auto',
@@ -1090,7 +1589,7 @@ def fit(  # type: ignore[override]
             eval_metric = eval_metric_list
 
         # do not modify args, as it causes errors in model selection tools
-        valid_sets: Optional[List[Tuple]] = None
+        valid_sets: Optional[List[_LGBM_ScikitValidSet]] = None
         if eval_set is not None:
             if isinstance(eval_set, tuple):
                 eval_set = [eval_set]
@@ -1127,7 +1626,7 @@ def fit(  # type: ignore[override]
 
     def predict(
         self,
-        X,
+        X: _LGBM_ScikitMatrixLike,
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -1157,7 +1656,7 @@ def predict(
 
     def predict_proba(
         self,
-        X,
+        X: _LGBM_ScikitMatrixLike,
         raw_score: bool = False,
         start_iteration: int = 0,
         num_iteration: Optional[int] = None,
@@ -1189,7 +1688,7 @@ def predict_proba(
 
     predict_proba.__doc__ = _lgbmmodel_doc_predict.format(
         description="Return the predicted probability for each class for each sample.",
-        X_shape="array-like or sparse matrix of shape = [n_samples, n_features]",
+        X_shape="numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, list of lists of int or float of shape = [n_samples, n_features]",
         output_name="predicted_probability",
         predicted_result_shape="array-like of shape = [n_samples] or shape = [n_samples, n_classes]",
         X_leaves_shape="array-like of shape = [n_samples, n_trees] or shape = [n_samples, n_trees * n_classes]",
@@ -1223,16 +1722,16 @@ class LGBMRanker(LGBMModel):
 
     def fit(  # type: ignore[override]
         self,
-        X,
-        y,
-        sample_weight=None,
-        init_score=None,
-        group=None,
-        eval_set=None,
+        X: _LGBM_ScikitMatrixLike,
+        y: _LGBM_LabelType,
+        sample_weight: Optional[_LGBM_WeightType] = None,
+        init_score: Optional[_LGBM_InitScoreType] = None,
+        group: Optional[_LGBM_GroupType] = None,
+        eval_set: Optional[List[_LGBM_ScikitValidSet]] = None,
         eval_names: Optional[List[str]] = None,
-        eval_sample_weight=None,
-        eval_init_score=None,
-        eval_group=None,
+        eval_sample_weight: Optional[List[_LGBM_WeightType]] = None,
+        eval_init_score: Optional[List[_LGBM_InitScoreType]] = None,
+        eval_group: Optional[List[_LGBM_GroupType]] = None,
         eval_metric: Optional[_LGBM_ScikitEvalMetricType] = None,
         eval_at: Union[List[int], Tuple[int, ...]] = (1, 2, 3, 4, 5),
         feature_name: _LGBM_FeatureNameConfiguration = 'auto',
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
new file mode 100644
index 000000000000..c3d550047389
--- /dev/null
+++ b/python-package/pyproject.toml
@@ -0,0 +1,57 @@
+[tool.isort]
+line_length = 120
+skip_glob = [
+    "*/external_libs/*",
+    "*/lightgbm-python/*"
+]
+
+[tool.mypy]
+exclude = 'build/*|compile/*|docs/*|examples/*|external_libs/*|lightgbm-python/*|tests/*'
+ignore_missing_imports = true
+
+[tool.ruff]
+exclude = [
+    "build",
+    "compile",
+    "docs",
+    "external_libs",
+    "lightgbm-python",
+    "setup.py"
+]
+ignore = [
+    # (pydocstyle) Missing docstring in magic method
+    "D105",
+    # (pycodestyle) Line too long
+    "E501"
+]
+select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # pycodestyle
+    "E",
+    # pyflakes
+    "F"
+]
+
+# this should be set to the oldest version of python LightGBM supports
+target-version = "py37"
+
+[tool.ruff.per-file-ignores]
+"examples/*" = [
+    # pydocstyle
+    "D"
+]
+"tests/*" = [
+    # (flake8-bugbear) Found useless expression
+    "B018",
+    # pydocstyle
+    "D"
+]
+
+[tool.ruff.pydocstyle]
+
+convention = "numpy"
diff --git a/python-package/setup.cfg b/python-package/setup.cfg
new file mode 100644
index 000000000000..0f2746df16c4
--- /dev/null
+++ b/python-package/setup.cfg
@@ -0,0 +1,12 @@
+[flake8]
+ignore =
+    # line too long
+    E501,
+    # line break occurred before a binary operator
+    W503
+exclude =
+    ./.nuget,
+    ./external_libs,
+    ./lightgbm-python,
+    ./python-package/build,
+    ./python-package/compile
diff --git a/python-package/setup.py b/python-package/setup.py
index b1620929f816..565cddd75ee4 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -7,8 +7,8 @@
 from os import chdir
 from pathlib import Path
 from platform import system
-from shutil import copyfile, copytree, rmtree
-from typing import List, Optional, Union
+from shutil import rmtree
+from typing import List, Optional
 
 from setuptools import find_packages, setup
 from setuptools.command.install import install
@@ -46,41 +46,6 @@ def find_lib() -> List[str]:
     return LIB_PATH
 
 
-def copy_files(integrated_opencl: bool = False, use_gpu: bool = False) -> None:
-
-    def copy_files_helper(folder_name: Union[str, Path]) -> None:
-        src = CURRENT_DIR.parent / folder_name
-        if src.is_dir():
-            dst = CURRENT_DIR / 'compile' / folder_name
-            if dst.is_dir():
-                rmtree(dst)
-            copytree(src, dst)
-        else:
-            raise Exception(f'Cannot copy {src} folder')
-
-    if not IS_SOURCE_FLAG_PATH.is_file():
-        copy_files_helper('include')
-        copy_files_helper('src')
-        for submodule in (CURRENT_DIR.parent / 'external_libs').iterdir():
-            submodule_stem = submodule.stem
-            if submodule_stem == 'compute' and not use_gpu:
-                continue
-            copy_files_helper(Path('external_libs') / submodule_stem)
-        (CURRENT_DIR / "compile" / "windows").mkdir(parents=True, exist_ok=True)
-        copyfile(CURRENT_DIR.parent / "windows" / "LightGBM.sln",
-                 CURRENT_DIR / "compile" / "windows" / "LightGBM.sln")
-        copyfile(CURRENT_DIR.parent / "windows" / "LightGBM.vcxproj",
-                 CURRENT_DIR / "compile" / "windows" / "LightGBM.vcxproj")
-        copyfile(CURRENT_DIR.parent / "LICENSE",
-                 CURRENT_DIR / "LICENSE")
-        copyfile(CURRENT_DIR.parent / "CMakeLists.txt",
-                 CURRENT_DIR / "compile" / "CMakeLists.txt")
-        if integrated_opencl:
-            (CURRENT_DIR / "compile" / "cmake").mkdir(parents=True, exist_ok=True)
-            copyfile(CURRENT_DIR.parent / "cmake" / "IntegratedOpenCL.cmake",
-                     CURRENT_DIR / "compile" / "cmake" / "IntegratedOpenCL.cmake")
-
-
 def clear_path(path: Path) -> None:
     if path.is_dir():
         for file_name in path.iterdir():
@@ -126,7 +91,7 @@ def compile_cpp(
 
     logger.info("Starting to compile the library.")
 
-    cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")]
+    cmake_cmd = ["cmake", str(CURRENT_DIR / "compile"), "-D__BUILD_FOR_PYTHON=ON"]
     if integrated_opencl:
         use_gpu = False
         cmake_cmd.append("-D__INTEGRATE_OPENCL=ON")
@@ -160,7 +125,8 @@ def compile_cpp(
             if use_mpi:
                 raise Exception('MPI version cannot be compiled by MinGW due to the miss of MPI library in it')
             logger.info("Starting to compile with CMake and MinGW.")
-            silent_call(cmake_cmd + ["-G", "MinGW Makefiles"], raise_error=True,
+            # ref: https://stackoverflow.com/a/45104058/3986677
+            silent_call(cmake_cmd + ["-G", "MinGW Makefiles", "-DCMAKE_SH=CMAKE_SH-NOTFOUND"], raise_error=True,
                         error_msg='Please install CMake and all required dependencies first')
             silent_call(["mingw32-make.exe", "_lightgbm", f"-I{build_dir}", "-j4"], raise_error=True,
                         error_msg='Please install MinGW first')
@@ -254,7 +220,6 @@ def run(self) -> None:
                                 "please use 64-bit Python instead.")
         LOG_PATH.touch()
         if not self.precompile:
-            copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu)
             compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi,
                         use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir,
                         boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir,
@@ -315,7 +280,6 @@ def finalize_options(self) -> None:
 class CustomSdist(sdist):
 
     def run(self) -> None:
-        copy_files(integrated_opencl=True, use_gpu=True)
         IS_SOURCE_FLAG_PATH.touch()
         rmtree(CURRENT_DIR / 'lightgbm' / 'Release', ignore_errors=True)
         rmtree(CURRENT_DIR / 'lightgbm' / 'windows' / 'x64', ignore_errors=True)
@@ -332,11 +296,8 @@ def run(self) -> None:
     LOG_PATH = Path.home() / 'LightGBM_compilation.log'
     LOG_NOTICE = f"The full version of error log was saved into {LOG_PATH}"
     IS_SOURCE_FLAG_PATH = CURRENT_DIR / '_IS_SOURCE_PACKAGE.txt'
-    _version_src = CURRENT_DIR.parent / 'VERSION.txt'
-    _version_dst = CURRENT_DIR / 'lightgbm' / 'VERSION.txt'
-    if _version_src.is_file():
-        copyfile(_version_src, _version_dst)
-    version = _version_dst.read_text(encoding='utf-8').strip()
+    _version_file = CURRENT_DIR / 'lightgbm' / 'VERSION.txt'
+    version = _version_file.read_text(encoding='utf-8').strip()
     readme = (CURRENT_DIR / 'README.rst').read_text(encoding='utf-8')
 
     sys.path.insert(0, str(CURRENT_DIR))
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 30da15d81053..3d84599e6589 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -160,20 +160,6 @@ namespace LightGBM {
                                                const std::vector<double>& forced_upper_bounds) {
     std::vector<double> bin_upper_bound;
 
-    // get list of distinct values
-    int left_cnt_data = 0;
-    int cnt_zero = 0;
-    int right_cnt_data = 0;
-    for (int i = 0; i < num_distinct_values; ++i) {
-      if (distinct_values[i] <= -kZeroThreshold) {
-        left_cnt_data += counts[i];
-      } else if (distinct_values[i] > kZeroThreshold) {
-        right_cnt_data += counts[i];
-      } else {
-        cnt_zero += counts[i];
-      }
-    }
-
     // get number of positive and negative distinct values
     int left_cnt = -1;
     for (int i = 0; i < num_distinct_values; ++i) {
diff --git a/src/io/config.cpp b/src/io/config.cpp
index 86b64a52d105..e8578046960a 100644
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -378,6 +378,10 @@ void Config::CheckParamConflict() {
     if (deterministic) {
       Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
     }
+    if (use_quantized_grad) {
+      Log::Warning("Quantized training is not supported by GPU tree learner. Switch to full precision training.");
+      use_quantized_grad = false;
+    }
   } else if (device_type == std::string("cuda")) {
     // force row-wise for cuda version
     force_col_wise = false;
@@ -385,6 +389,10 @@ void Config::CheckParamConflict() {
     if (deterministic) {
       Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
     }
+    if (use_quantized_grad) {
+      Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
+      use_quantized_grad = false;
+    }
   }
   // linear tree learner must be serial type and run on CPU device
   if (linear_tree) {
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index b1dbcc378a27..0906ba4b6439 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -251,6 +251,10 @@ const std::unordered_set<std::string>& Config::parameter_set() {
   "output_model",
   "saved_feature_importance_type",
   "snapshot_freq",
+  "use_quantized_grad",
+  "num_grad_quant_bins",
+  "quant_train_renew_leaf",
+  "stochastic_rounding",
   "linear_tree",
   "max_bin",
   "max_bin_by_feature",
@@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
 
   GetInt(params, "snapshot_freq", &snapshot_freq);
 
+  GetBool(params, "use_quantized_grad", &use_quantized_grad);
+
+  GetInt(params, "num_grad_quant_bins", &num_grad_quant_bins);
+
+  GetBool(params, "quant_train_renew_leaf", &quant_train_renew_leaf);
+
+  GetBool(params, "stochastic_rounding", &stochastic_rounding);
+
   GetBool(params, "linear_tree", &linear_tree);
 
   GetInt(params, "max_bin", &max_bin);
@@ -828,6 +840,10 @@ const std::unordered_map<std::string, std::vector<std::string>>& Config::paramet
     {"output_model", {"model_output", "model_out"}},
     {"saved_feature_importance_type", {}},
     {"snapshot_freq", {"save_period"}},
+    {"use_quantized_grad", {}},
+    {"num_grad_quant_bins", {}},
+    {"quant_train_renew_leaf", {}},
+    {"stochastic_rounding", {}},
     {"linear_tree", {"linear_trees"}},
     {"max_bin", {"max_bins"}},
     {"max_bin_by_feature", {}},
@@ -966,6 +982,10 @@ const std::unordered_map<std::string, std::string>& Config::ParameterTypes() {
     {"output_model", "string"},
     {"saved_feature_importance_type", "int"},
     {"snapshot_freq", "int"},
+    {"use_quantized_grad", "bool"},
+    {"num_grad_quant_bins", "int"},
+    {"quant_train_renew_leaf", "bool"},
+    {"stochastic_rounding", "bool"},
     {"linear_tree", "bool"},
     {"max_bin", "int"},
     {"max_bin_by_feature", "vector<int>"},
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index a8f449d3f55b..5b23f01ec3a0 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& of
   return ret.release();
 }
 
+template <bool USE_QUANT_GRAD, int HIST_BITS>
 TrainingShareStates* Dataset::GetShareStates(
     score_t* gradients, score_t* hessians,
     const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
-    bool force_col_wise, bool force_row_wise) const {
+    bool force_col_wise, bool force_row_wise,
+    const int num_grad_quant_bins) const {
   Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
                                   global_timer);
   if (force_col_wise && force_row_wise) {
@@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates(
     share_state->CalcBinOffsets(
       feature_groups_, &offsets, true);
     share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
-      num_data_, feature_groups_, false, true);
+      num_data_, feature_groups_, false, true, num_grad_quant_bins);
     share_state->is_col_wise = true;
     share_state->is_constant_hessian = is_constant_hessian;
     return share_state;
@@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates(
     share_state->CalcBinOffsets(
       feature_groups_, &offsets, false);
     share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
-      feature_groups_, false, false);
+      feature_groups_, false, false, num_grad_quant_bins);
     share_state->is_col_wise = false;
     share_state->is_constant_hessian = is_constant_hessian;
     return share_state;
@@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates(
     std::vector<uint32_t> col_wise_offsets;
     col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
     col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
-      feature_groups_, false, true);
+      feature_groups_, false, true, num_grad_quant_bins);
     col_wise_init_time = std::chrono::steady_clock::now() - start_time;
 
     start_time = std::chrono::steady_clock::now();
     std::vector<uint32_t> row_wise_offsets;
     row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
     row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
-      feature_groups_, false, false);
+      feature_groups_, false, false, num_grad_quant_bins);
     row_wise_init_time = std::chrono::steady_clock::now() - start_time;
 
     uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(),
@@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates(
     InitTrain(is_feature_used, row_wise_state.get());
     std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
     start_time = std::chrono::steady_clock::now();
-    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
+    ConstructHistograms<USE_QUANT_GRAD, HIST_BITS>(is_feature_used, nullptr, num_data_, gradients,
                         hessians, gradients, hessians, col_wise_state.get(),
                         hist_data.data());
     col_wise_time = std::chrono::steady_clock::now() - start_time;
     start_time = std::chrono::steady_clock::now();
-    ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
+    ConstructHistograms<USE_QUANT_GRAD, HIST_BITS>(is_feature_used, nullptr, num_data_, gradients,
                         hessians, gradients, hessians, row_wise_state.get(),
                         hist_data.data());
     row_wise_time = std::chrono::steady_clock::now() - start_time;
@@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates(
   }
 }
 
+template TrainingShareStates* Dataset::GetShareStates<false, 0>(
+    score_t* gradients, score_t* hessians,
+    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
+    bool force_col_wise, bool force_row_wise,
+    const int num_grad_quant_bins) const;
+
+template TrainingShareStates* Dataset::GetShareStates<true, 16>(
+    score_t* gradients, score_t* hessians,
+    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
+    bool force_col_wise, bool force_row_wise,
+    const int num_grad_quant_bins) const;
+
+template TrainingShareStates* Dataset::GetShareStates<true, 32>(
+    score_t* gradients, score_t* hessians,
+    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
+    bool force_col_wise, bool force_row_wise,
+    const int num_grad_quant_bins) const;
+
 void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
   feature_groups_.clear();
   num_features_ = dataset->num_features_;
@@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
         is_feature_used);
 }
 
-template <bool USE_INDICES, bool ORDERED>
+template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
 void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                           data_size_t num_data,
                                           const score_t* gradients,
@@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                           hist_t* hist_data) const {
   Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
                                  global_timer);
-  share_state->ConstructHistograms<USE_INDICES, ORDERED>(
+  share_state->ConstructHistograms<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
       data_indices, num_data, gradients, hessians, hist_data);
 }
 
-template <bool USE_INDICES, bool USE_HESSIAN>
+template <bool USE_INDICES, bool USE_HESSIAN, bool USE_QUANT_GRAD, int HIST_BITS>
 void Dataset::ConstructHistogramsInner(
     const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
     data_size_t num_data, const score_t* gradients, const score_t* hessians,
     score_t* ordered_gradients, score_t* ordered_hessians,
     TrainingShareStates* share_state, hist_t* hist_data) const {
   if (!share_state->is_col_wise) {
-    return ConstructHistogramsMultiVal<USE_INDICES, false>(
+    return ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
         data_indices, num_data, gradients, hessians, share_state, hist_data);
   }
   std::vector<int> used_dense_group;
@@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner(
     for (int gi = 0; gi < num_used_dense_group; ++gi) {
       OMP_LOOP_EX_BEGIN();
       int group = used_dense_group[gi];
-      auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
       const int num_bin = feature_groups_[group]->num_total_bin_;
-      std::memset(reinterpret_cast<void*>(data_ptr), 0,
-                  num_bin * kHistEntrySize);
-      if (USE_HESSIAN) {
-        if (USE_INDICES) {
-          feature_groups_[group]->bin_data_->ConstructHistogram(
-              data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
-              data_ptr);
+      if (USE_QUANT_GRAD) {
+        if (HIST_BITS == 16) {
+          auto data_ptr = reinterpret_cast<hist_t*>(reinterpret_cast<int32_t*>(hist_data) + group_bin_boundaries_[group]);
+          std::memset(reinterpret_cast<void*>(data_ptr), 0,
+                      num_bin * kInt16HistEntrySize);
+          if (USE_HESSIAN) {
+            if (USE_INDICES) {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+                  data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+                  data_ptr);
+            } else {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+                  0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+            }
+          } else {
+            if (USE_INDICES) {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+                  data_indices, 0, num_data, ptr_ordered_grad,
+                  data_ptr);
+            } else {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt16(
+                  0, num_data, ptr_ordered_grad, data_ptr);
+            }
+          }
         } else {
-          feature_groups_[group]->bin_data_->ConstructHistogram(
-              0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+          auto data_ptr = hist_data + group_bin_boundaries_[group];
+          std::memset(reinterpret_cast<void*>(data_ptr), 0,
+                      num_bin * kInt32HistEntrySize);
+          if (USE_HESSIAN) {
+            if (USE_INDICES) {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+                  data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+                  data_ptr);
+            } else {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+                  0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+            }
+          } else {
+            if (USE_INDICES) {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+                  data_indices, 0, num_data, ptr_ordered_grad,
+                  data_ptr);
+            } else {
+              feature_groups_[group]->bin_data_->ConstructHistogramInt32(
+                  0, num_data, ptr_ordered_grad, data_ptr);
+            }
+          }
         }
       } else {
-        if (USE_INDICES) {
-          feature_groups_[group]->bin_data_->ConstructHistogram(
-              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
+        auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
+        std::memset(reinterpret_cast<void*>(data_ptr), 0,
+                    num_bin * kHistEntrySize);
+        if (USE_HESSIAN) {
+          if (USE_INDICES) {
+            feature_groups_[group]->bin_data_->ConstructHistogram(
+                data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
+                data_ptr);
+          } else {
+            feature_groups_[group]->bin_data_->ConstructHistogram(
+                0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
+          }
         } else {
-          feature_groups_[group]->bin_data_->ConstructHistogram(
-              0, num_data, ptr_ordered_grad, data_ptr);
-        }
-        auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
-        for (int i = 0; i < num_bin * 2; i += 2) {
-          data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
+          if (USE_INDICES) {
+            feature_groups_[group]->bin_data_->ConstructHistogram(
+                data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
+          } else {
+            feature_groups_[group]->bin_data_->ConstructHistogram(
+                0, num_data, ptr_ordered_grad, data_ptr);
+          }
+          auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
+          for (int i = 0; i < num_bin * 2; i += 2) {
+            data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
+          }
         }
       }
       OMP_LOOP_EX_END();
@@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner(
   }
   global_timer.Stop("Dataset::dense_bin_histogram");
   if (multi_val_groud_id >= 0) {
-    if (num_used_dense_group > 0) {
-      ConstructHistogramsMultiVal<USE_INDICES, true>(
-          data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
-          share_state,
-          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+    if (USE_QUANT_GRAD) {
+      if (HIST_BITS == 32) {
+        int32_t* hist_data_ptr = reinterpret_cast<int32_t*>(hist_data);
+        if (num_used_dense_group > 0) {
+          ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
+              data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+              share_state,
+              reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+        } else {
+          ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
+              data_indices, num_data, gradients, hessians, share_state,
+              reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+        }
+      } else if (HIST_BITS == 16) {
+        int16_t* hist_data_ptr = reinterpret_cast<int16_t*>(hist_data);
+        if (num_used_dense_group > 0) {
+          ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
+              data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+              share_state,
+              reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+        } else {
+          ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
+              data_indices, num_data, gradients, hessians, share_state,
+              reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
+        }
+      }
     } else {
-      ConstructHistogramsMultiVal<USE_INDICES, false>(
-          data_indices, num_data, gradients, hessians, share_state,
-          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+      if (num_used_dense_group > 0) {
+        ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
+            data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+            share_state,
+            hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+      } else {
+        ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
+            data_indices, num_data, gradients, hessians, share_state,
+            hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+      }
     }
   }
 }
 
 // explicitly initialize template methods, for cross module call
-template void Dataset::ConstructHistogramsInner<true, true>(
-    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
-    data_size_t num_data, const score_t* gradients, const score_t* hessians,
-    score_t* ordered_gradients, score_t* ordered_hessians,
-    TrainingShareStates* share_state, hist_t* hist_data) const;
+#define CONSTRUCT_HISTOGRAMS_INNER_PARMA \
+  const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, \
+  data_size_t num_data, const score_t* gradients, const score_t* hessians, \
+  score_t* ordered_gradients, score_t* ordered_hessians, \
+  TrainingShareStates* share_state, hist_t* hist_data
 
-template void Dataset::ConstructHistogramsInner<true, false>(
-    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
-    data_size_t num_data, const score_t* gradients, const score_t* hessians,
-    score_t* ordered_gradients, score_t* ordered_hessians,
-    TrainingShareStates* share_state, hist_t* hist_data) const;
+// explicitly instantiate ConstructHistogramsInner for the <USE_INDICES, USE_HESSIAN, USE_QUANT_GRAD, HIST_BITS> combinations listed below
+template void Dataset::ConstructHistogramsInner<true, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
 
-template void Dataset::ConstructHistogramsInner<false, true>(
-    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
-    data_size_t num_data, const score_t* gradients, const score_t* hessians,
-    score_t* ordered_gradients, score_t* ordered_hessians,
-    TrainingShareStates* share_state, hist_t* hist_data) const;
+template void Dataset::ConstructHistogramsInner<true, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
 
-template void Dataset::ConstructHistogramsInner<false, false>(
-    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
-    data_size_t num_data, const score_t* gradients, const score_t* hessians,
-    score_t* ordered_gradients, score_t* ordered_hessians,
-    TrainingShareStates* share_state, hist_t* hist_data) const;
+template void Dataset::ConstructHistogramsInner<false, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<true, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
+
+template void Dataset::ConstructHistogramsInner<false, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
 
 void Dataset::FixHistogram(int feature_idx, double sum_gradient,
                            double sum_hessian, hist_t* data) const {
@@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient,
   }
 }
 
+template <typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
+void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const {  // rebuilds one feature's most-frequent bin from the packed total
+  const int group = feature2group_[feature_idx];
+  const int sub_feature = feature2subfeature_[feature_idx];
+  const BinMapper* bin_mapper =
+      feature_groups_[group]->bin_mappers_[sub_feature].get();
+  const int most_freq_bin = bin_mapper->GetMostFreqBin();
+  PACKED_HIST_BIN_T* data_ptr = reinterpret_cast<PACKED_HIST_BIN_T*>(data);  // histogram viewed as packed integer bins
+  PACKED_HIST_ACC_T int_sum_gradient_and_hessian_local = HIST_BITS_ACC == 16 ?  // 16-bit case: repack the 64-bit total into 16 + 16 bits
+    ((static_cast<int32_t>(int_sum_gradient_and_hessian >> 32) << 16) |  // upper 32 bits of the total become the upper 16 bits
+    static_cast<int32_t>(int_sum_gradient_and_hessian & 0x0000ffff)) :  // lowest 16 bits of the total are kept as the lower half
+    int_sum_gradient_and_hessian;
+  if (most_freq_bin > 0) {  // nothing is rewritten when the most frequent bin is bin 0
+    const int num_bin = bin_mapper->num_bin();
+    if (HIST_BITS_BIN == HIST_BITS_ACC) {  // bins and accumulator share one width: subtract directly
+      for (int i = 0; i < num_bin; ++i) {
+        if (i != most_freq_bin) {
+          int_sum_gradient_and_hessian_local -= data_ptr[i];  // remove every other bin from the total
+        }
+      }
+      data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local;  // the remainder is the most frequent bin's entry
+    } else {
+      CHECK_EQ(HIST_BITS_ACC, 32);  // the only mixed-width case handled: 16-bit bins with a 32-bit accumulator
+      CHECK_EQ(HIST_BITS_BIN, 16);
+      for (int i = 0; i < num_bin; ++i) {
+        if (i != most_freq_bin) {
+          const PACKED_HIST_BIN_T packed_hist = data_ptr[i];
+          const PACKED_HIST_ACC_T packed_hist_acc = (static_cast<int64_t>(static_cast<int16_t>(packed_hist >> 16)) << 32) |  // widen: upper 16 bits, sign-extended, move to the upper 32
+            static_cast<int64_t>(packed_hist & 0x0000ffff);  // lower 16 bits stay in the lower half
+          int_sum_gradient_and_hessian_local -= packed_hist_acc;
+        }
+      }
+      PACKED_HIST_BIN_T int_sum_gradient_and_hessian_local_bin =
+        (static_cast<int32_t>(int_sum_gradient_and_hessian_local >> 32) << 16) | static_cast<int32_t>(int_sum_gradient_and_hessian_local & 0x0000ffff);  // narrow the remainder back to 16 + 16 bits
+      data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local_bin;
+    }
+  }
+}
+
+template void Dataset::FixHistogramInt<int64_t, int64_t, 32, 32>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
+
+template void Dataset::FixHistogramInt<int32_t, int32_t, 16, 16>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
+
 template <typename T>
 void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
   dest->reserve(dest->size() + src.size());
diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp
index 3d0f8db8e549..e612052e47d2 100644
--- a/src/io/dense_bin.hpp
+++ b/src/io/dense_bin.hpp
@@ -171,6 +171,146 @@ class DenseBin : public Bin {
   }
 
 
+  template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN, typename PACKED_HIST_T, int HIST_BITS>
+  void ConstructHistogramIntInner(const data_size_t* data_indices,
+                               data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients,
+                               hist_t* out) const {  // adds one packed integer value per row to the histogram slot of that row's bin
+    data_size_t i = start;
+    PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);  // histogram viewed as packed integer entries
+    const int16_t* gradients_ptr = reinterpret_cast<const int16_t*>(ordered_gradients);  // gradient buffer read as one int16 per row
+    const VAL_T* data_ptr_base = data_.data();
+    if (USE_PREFETCH) {
+      const data_size_t pf_offset = 64 / sizeof(VAL_T);  // prefetch distance, in rows
+      const data_size_t pf_end = end - pf_offset;  // stop early so that i + pf_offset stays below end
+      for (; i < pf_end; ++i) {
+        const auto idx = USE_INDICES ? data_indices[i] : i;  // row whose bin is read
+        const auto pf_idx =
+            USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;  // row touched pf_offset iterations from now
+        if (IS_4BIT) {
+          PREFETCH_T0(data_ptr_base + (pf_idx >> 1));  // 4-bit storage: element index is halved
+        } else {
+          PREFETCH_T0(data_ptr_base + pf_idx);
+        }
+        const auto ti = static_cast<uint32_t>(data(idx));
+        const int16_t gradient_16 = gradients_ptr[i];  // indexed by loop position i, not by idx
+        if (USE_HESSIAN) {
+          const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :  // 8-bit case: the int16 is added unchanged
+            (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);  // else: high byte (sign-extended) shifted up by HIST_BITS, low byte kept
+          out_ptr[ti] += gradient_packed;
+        } else {
+          const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :  // NOTE(review): the 8-bit case adds the raw int16, not high byte | 1 - confirm intended
+            (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);  // without hessians the low part is the constant 1
+          out_ptr[ti] += gradient_packed;
+        }
+      }
+    }
+    for (; i < end; ++i) {  // rows not handled above (all rows when USE_PREFETCH is false), no prefetching
+      const auto idx = USE_INDICES ? data_indices[i] : i;
+      const auto ti = static_cast<uint32_t>(data(idx));
+      const int16_t gradient_16 = gradients_ptr[i];
+      if (USE_HESSIAN) {
+        const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :  // same packing as in the prefetch loop
+            (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
+        out_ptr[ti] += gradient_packed;
+      } else {
+        const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :  // same packing as in the prefetch loop
+            (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);
+        out_ptr[ti] += gradient_packed;
+      }
+    }
+  }
+
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int16_t, 8>(  // USE_INDICES, USE_PREFETCH, USE_HESSIAN on; int16_t entries
+        data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, true, int16_t, 8>(  // rows [start, end) directly, no prefetch, USE_HESSIAN on
+        nullptr, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int16_t, 8>(  // USE_INDICES and USE_PREFETCH on, USE_HESSIAN off
+      data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int16_t, 8>(  // rows [start, end) directly, no prefetch, USE_HESSIAN off
+        nullptr, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int32_t, 16>(  // USE_INDICES, USE_PREFETCH, USE_HESSIAN on; int32_t entries
+        data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, true, int32_t, 16>(  // rows [start, end) directly, no prefetch, USE_HESSIAN on
+        nullptr, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int32_t, 16>(  // USE_INDICES and USE_PREFETCH on, USE_HESSIAN off
+      data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int32_t, 16>(  // rows [start, end) directly, no prefetch, USE_HESSIAN off
+        nullptr, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int64_t, 32>(  // USE_INDICES, USE_PREFETCH, USE_HESSIAN on; int64_t entries
+        data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          const score_t* /*ordered_hessians*/,  // unnamed: not read by this overload
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, true, int64_t, 32>(  // rows [start, end) directly, no prefetch, USE_HESSIAN on
+        nullptr, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int64_t, 32>(  // USE_INDICES and USE_PREFETCH on, USE_HESSIAN off
+      data_indices, start, end, ordered_gradients, out);
+  }
+
+  void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int64_t, 32>(  // rows [start, end) directly, no prefetch, USE_HESSIAN off
+        nullptr, start, end, ordered_gradients, out);
+  }
+
   template <bool MISS_IS_ZERO, bool MISS_IS_NA, bool MFB_IS_ZERO,
             bool MFB_IS_NA, bool USE_MIN_BIN>
   data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin,
diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp
index b4fbfbe673aa..780272bdc4e1 100644
--- a/src/io/multi_val_dense_bin.hpp
+++ b/src/io/multi_val_dense_bin.hpp
@@ -124,6 +124,123 @@ class MultiValDenseBin : public MultiValBin {
                                               gradients, hessians, out);
   }
 
+  // Adds each row's packed int16 (gradient, hessian) value, widened to PACKED_HIST_T, to the bins of all features.
+  // With ORDERED the gradient buffer is indexed by loop position i, otherwise by row index idx.
+  template<bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
+  void ConstructHistogramIntInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
+    const score_t* gradients_and_hessians, hist_t* out) const {
+    data_size_t i = start;
+    const VAL_T* data_ptr_base = data_.data();
+    const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
+    PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+    // main loop: prefetch the row (and, if unordered, its gradient) pf_offset iterations ahead
+    if (USE_PREFETCH) {
+      const data_size_t pf_offset = 32 / sizeof(VAL_T);
+      const data_size_t pf_end = end - pf_offset;
+      for (; i < pf_end; ++i) {
+        const auto idx = USE_INDICES ? data_indices[i] : i;
+        const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+        if (!ORDERED) {
+          PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
+        }
+        PREFETCH_T0(data_ptr_base + RowPtr(pf_idx));
+        const VAL_T* data_ptr = data_ptr_base + RowPtr(idx);
+        const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
+        const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+        for (int j = 0; j < num_feature_; ++j) {
+          const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
+          const auto ti = (bin + offsets_[j]);
+          out_ptr[ti] += gradient_packed;
+        }
+      }
+    }
+    // remaining rows (all rows when USE_PREFETCH is false), no prefetching
+    for (; i < end; ++i) {
+      const auto idx = USE_INDICES ? data_indices[i] : i;
+      const VAL_T* data_ptr = data_ptr_base + RowPtr(idx);
+      const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
+      const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
+          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+      for (int j = 0; j < num_feature_; ++j) {
+        const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
+        const auto ti = (bin + offsets_[j]);
+        out_ptr[ti] += gradient_packed;
+      }
+    }
+  }
+
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int64_t, 32>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH, ORDERED on
+                                                              gradients, out);
+  }
+
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int32_t, 16>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH, ORDERED on
+                                                              gradients, out);
+  }
+
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int16_t, 8>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH, ORDERED on
+                                                              gradients, out);
+  }
+
   MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double,
     const std::vector<uint32_t>& offsets) const override {
     return new MultiValDenseBin<VAL_T>(num_data, num_bin, num_feature, offsets);
diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp
index eaa30ef0a0cc..32a5a51b4f89 100644
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -180,6 +180,124 @@ class MultiValSparseBin : public MultiValBin {
                                               gradients, hessians, out);
   }
 
+  template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
+  void ConstructHistogramIntInner(const data_size_t* data_indices,
+                               data_size_t start, data_size_t end,
+                               const score_t* gradients_and_hessians, hist_t* out) const {  // adds each row's packed value to the slot of every entry stored for that row
+    data_size_t i = start;
+    PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);  // histogram viewed as packed integer entries
+    const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);  // one int16 per row
+    const VAL_T* data_ptr = data_.data();
+    const INDEX_T* row_ptr_base = row_ptr_.data();
+    if (USE_PREFETCH) {
+      const data_size_t pf_offset = 32 / sizeof(VAL_T);  // prefetch distance, in rows
+      const data_size_t pf_end = end - pf_offset;  // stop early so that i + pf_offset stays below end
+
+      for (; i < pf_end; ++i) {
+        const auto idx = USE_INDICES ? data_indices[i] : i;
+        const auto pf_idx =
+            USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+        if (!ORDERED) {
+          PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);  // only unordered buffers are addressed by row index
+        }
+        PREFETCH_T0(row_ptr_base + pf_idx);  // row-pointer entry of the future row
+        PREFETCH_T0(data_ptr + row_ptr_[pf_idx]);  // first stored value of the future row
+        const auto j_start = RowPtr(idx);  // entries of row idx occupy [j_start, j_end)
+        const auto j_end = RowPtr(idx + 1);
+        const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];  // ordered: by position i; else by row idx
+        const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :  // 8-bit case: the int16 is added unchanged
+          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |  // else: high byte (sign-extended) shifted up by HIST_BITS
+          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));  // low byte kept in the lower part
+        for (auto j = j_start; j < j_end; ++j) {
+          const auto ti = static_cast<uint32_t>(data_ptr[j]);  // stored value is used directly as the histogram slot
+          out_ptr[ti] += gradient_packed;
+        }
+      }
+    }
+    for (; i < end; ++i) {  // rows not handled above (all rows when USE_PREFETCH is false), no prefetching
+      const auto idx = USE_INDICES ? data_indices[i] : i;
+      const auto j_start = RowPtr(idx);
+      const auto j_end = RowPtr(idx + 1);
+      const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
+      const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :  // same packing as in the prefetch loop
+          ((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
+          static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
+      for (auto j = j_start; j < j_end; ++j) {
+        const auto ti = static_cast<uint32_t>(data_ptr[j]);
+        out_ptr[ti] += gradient_packed;
+      }
+    }
+  }
+
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt32(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int64_t, 32>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,  // ORDERED on: gradients read by position
+                                                              gradients, out);
+  }
+
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt16(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int32_t, 16>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,  // ORDERED on: gradients read by position
+                                                              gradients, out);
+  }
+
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* /*hessians*/, hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,  // USE_INDICES, USE_PREFETCH on; ORDERED off
+                                                               gradients, out);
+  }
+
+  void ConstructHistogramInt8(data_size_t start, data_size_t end,
+                          const score_t* gradients, const score_t* /*hessians*/,
+                          hist_t* out) const override {
+    ConstructHistogramIntInner<false, false, false, int16_t, 8>(  // rows [start, end) directly, no prefetch, ORDERED off
+        nullptr, start, end, gradients, out);
+  }
+
+  void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* /*hessians*/,
+                                 hist_t* out) const override {
+    ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,  // ORDERED on: gradients read by position
+                                                              gradients, out);
+  }
+
   MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
                           double estimate_element_per_row,
                           const std::vector<uint32_t>& /*offsets*/) const override {
diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp
index e01c0afcf5bc..f7137d29ffd9 100644
--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -203,6 +203,184 @@ class SparseBin : public Bin {
   }
 #undef ACC_GH
 
+  template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>  // integer-histogram kernel for the contiguous row range [start, end)
+  void ConstructIntHistogramInner(data_size_t start, data_size_t end,
+                          const score_t* ordered_gradients_and_hessians,  // raw buffer; reinterpreted below as int16_t (USE_HESSIAN) or int8_t values
+                          hist_t* out) const {  // raw histogram buffer; reinterpreted below as PACKED_HIST_T or GRAD_HIST_T / HESS_HIST_T
+    data_size_t i_delta, cur_pos;
+    InitIndex(start, &i_delta, &cur_pos);  // set up the (i_delta, cur_pos) cursor for row `start`; InitIndex is defined outside this hunk
+    if (USE_HESSIAN) {  // packed path: one PACKED_HIST_T accumulator per bin
+      PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+      const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
+      while (cur_pos < start && i_delta < num_vals_) {  // skip stored values located before `start`
+        cur_pos += deltas_[++i_delta];
+      }
+      while (cur_pos < end && i_delta < num_vals_) {  // visit stored values until `end` is reached or the value list is exhausted
+        const VAL_T bin = vals_[i_delta];
+        const int16_t gradient_16 = gradients_and_hessians_ptr[cur_pos];  // one int16_t per row, indexed by row position
+        const PACKED_HIST_T gradient_64 = (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);  // sign-extended high byte moved up by HIST_BITS, low byte kept in the low bits
+        out_ptr[bin] += gradient_64;  // a single add updates both packed halves
+        cur_pos += deltas_[++i_delta];  // advance to the next stored value
+      }
+    } else {  // split path: `out` is addressed as alternating GRAD_HIST_T / HESS_HIST_T slots
+      GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
+      HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;  // counter slot sits one HESS_HIST_T element after the gradient slot
+      const int8_t* gradients_and_hessians_ptr = reinterpret_cast<const int8_t*>(ordered_gradients_and_hessians);
+      while (cur_pos < start && i_delta < num_vals_) {  // skip stored values located before `start`
+        cur_pos += deltas_[++i_delta];
+      }
+      while (cur_pos < end && i_delta < num_vals_) {
+        const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;  // two slots per bin
+        grad[ti] += gradients_and_hessians_ptr[cur_pos];  // NOTE(review): indexed by cur_pos here, while the data_indices overload reads [i << 1] - confirm the intended buffer layout
+        ++cnt[ti];  // one more row counted for this bin
+        cur_pos += deltas_[++i_delta];
+      }
+    }
+  }
+
+  template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>  // integer-histogram kernel for the rows data_indices[start] .. data_indices[end - 1]
+  void ConstructIntHistogramInner(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* ordered_gradients_and_hessians,  // raw buffer; reinterpreted below as int16_t (USE_HESSIAN) or int8_t values
+                          hist_t* out) const {  // raw histogram buffer; reinterpreted below as PACKED_HIST_T or GRAD_HIST_T / HESS_HIST_T
+    data_size_t i_delta, cur_pos;
+    InitIndex(data_indices[start], &i_delta, &cur_pos);  // cursor is set up for the first requested row; InitIndex is defined outside this hunk
+    data_size_t i = start;  // i walks data_indices and also indexes the gradient buffer
+    if (USE_HESSIAN) {  // packed path: one PACKED_HIST_T accumulator per bin
+      PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
+      const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
+      for (;;) {  // walks stored positions (cur_pos) and requested rows (data_indices[i]) together; presumably data_indices is ascending - confirm
+        if (cur_pos < data_indices[i]) {  // stored position is behind the requested row: advance the sparse cursor
+          cur_pos += deltas_[++i_delta];
+          if (i_delta >= num_vals_) {  // no stored values left
+            break;
+          }
+        } else if (cur_pos > data_indices[i]) {  // requested row has no stored value: go to the next requested row
+          if (++i >= end) {
+            break;
+          }
+        } else {  // match: the requested row has a stored value
+          const VAL_T bin = vals_[i_delta];
+          const int16_t gradient_16 = gradients_and_hessians_ptr[i];  // indexed by position in data_indices, not by row id
+          const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :  // 8-bit histograms take the int16_t value as-is
+            (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);  // otherwise: sign-extended high byte moved up by HIST_BITS, low byte kept
+          out_ptr[bin] += gradient_packed;
+          if (++i >= end) {  // after a match both sequences advance
+            break;
+          }
+          cur_pos += deltas_[++i_delta];
+          if (i_delta >= num_vals_) {
+            break;
+          }
+        }
+      }
+    } else {  // split path: `out` is addressed as alternating GRAD_HIST_T / HESS_HIST_T slots
+      GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
+      HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;  // counter slot sits one HESS_HIST_T element after the gradient slot
+      const int8_t* gradients_and_hessians_ptr = reinterpret_cast<const int8_t*>(ordered_gradients_and_hessians);
+      for (;;) {  // same three-way walk as the USE_HESSIAN branch
+        if (cur_pos < data_indices[i]) {
+          cur_pos += deltas_[++i_delta];
+          if (i_delta >= num_vals_) {
+            break;
+          }
+        } else if (cur_pos > data_indices[i]) {
+          if (++i >= end) {
+            break;
+          }
+        } else {
+          const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;  // two slots per bin
+          grad[ti] += gradients_and_hessians_ptr[i << 1];  // NOTE(review): reads int8_t element 2 * i, while the range overload reads [cur_pos] unscaled - confirm the intended buffer layout
+          ++cnt[ti];  // one more row counted for this bin
+          if (++i >= end) {
+            break;
+          }
+          cur_pos += deltas_[++i_delta];
+          if (i_delta >= num_vals_) {
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // 32-bit integer histograms: each bin is one int64_t packed entry (USE_HESSIAN = true) or an
+  // int32_t gradient slot followed by a uint32_t counter slot (USE_HESSIAN = false).
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, const score_t* /*ordered_hessians*/,
+                               hist_t* out) const override {
+    ConstructIntHistogramInner<true, int64_t, int32_t, uint32_t, 32>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload (no index array); the hessian pointer is ignored.
+  void ConstructHistogramInt32(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                               const score_t* /*ordered_hessians*/, hist_t* out) const override {
+    ConstructIntHistogramInner<true, int64_t, int32_t, uint32_t, 32>(start, end, ordered_gradients, out);
+  }
+
+  // Indexed overload without a hessian argument: selects the USE_HESSIAN = false kernel.
+  void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, hist_t* out) const override {
+    ConstructIntHistogramInner<false, int64_t, int32_t, uint32_t, 32>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload without a hessian argument (USE_HESSIAN = false).
+  void ConstructHistogramInt32(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                               hist_t* out) const override {
+    ConstructIntHistogramInner<false, int64_t, int32_t, uint32_t, 32>(start, end, ordered_gradients, out);
+  }
+
+  // 16-bit integer histograms: each bin is one int32_t packed entry (USE_HESSIAN = true) or an
+  // int16_t gradient slot followed by a uint16_t counter slot (USE_HESSIAN = false).
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, const score_t* /*ordered_hessians*/,
+                               hist_t* out) const override {
+    ConstructIntHistogramInner<true, int32_t, int16_t, uint16_t, 16>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload (no index array); the hessian pointer is ignored.
+  void ConstructHistogramInt16(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                               const score_t* /*ordered_hessians*/, hist_t* out) const override {
+    ConstructIntHistogramInner<true, int32_t, int16_t, uint16_t, 16>(start, end, ordered_gradients, out);
+  }
+
+  // Indexed overload without a hessian argument: selects the USE_HESSIAN = false kernel.
+  void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                               const score_t* ordered_gradients, hist_t* out) const override {
+    ConstructIntHistogramInner<false, int32_t, int16_t, uint16_t, 16>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload without a hessian argument (USE_HESSIAN = false).
+  void ConstructHistogramInt16(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                               hist_t* out) const override {
+    ConstructIntHistogramInner<false, int32_t, int16_t, uint16_t, 16>(start, end, ordered_gradients, out);
+  }
+
+  // 8-bit integer histograms: each bin is one int16_t packed entry (USE_HESSIAN = true) or an int8_t
+  // gradient slot followed by a uint8_t counter slot, the same signed/unsigned pairing as Int16/Int32.
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                              const score_t* ordered_gradients, const score_t* /*ordered_hessians*/,
+                              hist_t* out) const override {
+    ConstructIntHistogramInner<true, int16_t, int8_t, uint8_t, 8>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload (no index array); the hessian pointer is ignored.
+  void ConstructHistogramInt8(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                              const score_t* /*ordered_hessians*/, hist_t* out) const override {
+    ConstructIntHistogramInner<true, int16_t, int8_t, uint8_t, 8>(start, end, ordered_gradients, out);
+  }
+
+  // Indexed overload without a hessian argument: signed int8_t gradients are summed into int8_t slots.
+  void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
+                              const score_t* ordered_gradients, hist_t* out) const override {
+    ConstructIntHistogramInner<false, int16_t, int8_t, uint8_t, 8>(data_indices, start, end, ordered_gradients, out);
+  }
+
+  // Contiguous-range overload without a hessian argument (USE_HESSIAN = false).
+  void ConstructHistogramInt8(data_size_t start, data_size_t end, const score_t* ordered_gradients,
+                              hist_t* out) const override {
+    ConstructIntHistogramInner<false, int16_t, int8_t, uint8_t, 8>(start, end, ordered_gradients, out);
+  }
+
   inline void NextNonzeroFast(data_size_t* i_delta,
                               data_size_t* cur_pos) const {
     *cur_pos += deltas_[++(*i_delta)];
diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp
index f6462697a93d..71b2e097ef1b 100644
--- a/src/io/train_share_states.cpp
+++ b/src/io/train_share_states.cpp
@@ -9,7 +9,7 @@
 namespace LightGBM {
 
 MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
-  const std::vector<int>& feature_groups_contained):
+  const std::vector<int>& feature_groups_contained, const int num_grad_quant_bins):
     feature_groups_contained_(feature_groups_contained) {
   num_threads_ = OMP_NUM_THREADS();
   num_data_ = num_data;
@@ -19,6 +19,7 @@ MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
   }
   num_bin_ = bin->num_bin();
   num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
+  num_grad_quant_bins_ = num_grad_quant_bins;
 }
 
 void MultiValBinWrapper::InitTrain(const std::vector<int>& group_feature_start,
@@ -45,43 +46,161 @@ void MultiValBinWrapper::InitTrain(const std::vector<int>& group_feature_start,
   }
 }
 
+template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>  // copies histogram data from the tail region of hist_buf into origin_hist_data_
 void MultiValBinWrapper::HistMove(const std::vector<hist_t,
   Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf) {
-  if (!is_use_subcol_) {
+  if (!is_use_subcol_ && INNER_HIST_BITS != 8) {  // nothing to do only when sub-columns are off AND the inner histogram is not 8-bit
     return;
   }
-  const hist_t* src = hist_buf.data() + hist_buf.size() -
-    2 * static_cast<size_t>(num_bin_aligned_);
-  #pragma omp parallel for schedule(static)
-  for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
-    std::copy_n(src + hist_move_src_[i], hist_move_size_[i],
-                origin_hist_data_ + hist_move_dest_[i]);
+  if (USE_QUANT_GRAD) {  // quantized gradients: both buffers are reinterpreted as packed integer entries
+    if (HIST_BITS == 32) {
+      const int64_t* src = reinterpret_cast<const int64_t*>(hist_buf.data()) + hist_buf.size() / 2 -  // source starts num_bin_aligned_ entries before int64 offset hist_buf.size() / 2
+        static_cast<size_t>(num_bin_aligned_);
+      #pragma omp parallel for schedule(static)
+      for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {  // NOTE(review): HistMove<true, 32, 8> reaches this loop even when is_use_subcol_ is false - confirm hist_move_src_ is empty then
+        std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,  // recorded offsets and sizes are halved for the int64 view
+                    reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
+      }
+    } else if (HIST_BITS == 16) {
+      const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -  // same expression as the 32-bit branch, evaluated on an int32_t pointer
+        static_cast<size_t>(num_bin_aligned_);
+      if (is_use_subcol_) {  // sub-columns: copy each recorded range
+        #pragma omp parallel for schedule(static)
+        for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
+          std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,  // recorded offsets and sizes are halved for the int32 view
+                      reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
+        }
+      } else {  // reachable only with INNER_HIST_BITS == 8, given the early return at the top
+        int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
+        #pragma omp parallel for schedule(static)
+        for (int i = 0; i < num_bin_; ++i) {
+          orig_ptr[i] = src[i];  // straight copy of the first num_bin_ entries
+        }
+      }
+    }
+  } else {  // non-quantized: the same hist_t range copy this function performed before the change
+    const hist_t* src = hist_buf.data() + hist_buf.size() -
+      2 * static_cast<size_t>(num_bin_aligned_);
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
+      std::copy_n(src + hist_move_src_[i], hist_move_size_[i],
+                  origin_hist_data_ + hist_move_dest_[i]);
+    }
   }
 }
 
+template void MultiValBinWrapper::HistMove<false, 0, 0>(const std::vector<hist_t,  // explicit instantiations; template arguments are <USE_QUANT_GRAD, HIST_BITS, INNER_HIST_BITS>
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template void MultiValBinWrapper::HistMove<false, 0, 8>(const std::vector<hist_t,  // non-quantized with INNER_HIST_BITS = 8
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template void MultiValBinWrapper::HistMove<true, 16, 8>(const std::vector<hist_t,  // quantized, HIST_BITS = 16, INNER_HIST_BITS = 8
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template void MultiValBinWrapper::HistMove<true, 16, 16>(const std::vector<hist_t,  // quantized, HIST_BITS = 16, INNER_HIST_BITS = 16
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template void MultiValBinWrapper::HistMove<true, 32, 8>(const std::vector<hist_t,  // quantized, HIST_BITS = 32, INNER_HIST_BITS = 8
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template void MultiValBinWrapper::HistMove<true, 32, 32>(const std::vector<hist_t,  // quantized, HIST_BITS = 32, INNER_HIST_BITS = 32
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
+
+template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>  // sums the per-data-block histograms stored in hist_buf into one destination histogram
 void MultiValBinWrapper::HistMerge(std::vector<hist_t,
   Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf) {
   int n_bin_block = 1;
   int bin_block_size = num_bin_;
   Threading::BlockInfo<data_size_t>(num_threads_, num_bin_, 512, &n_bin_block,
                                   &bin_block_size);
-  hist_t* dst = origin_hist_data_;
-  if (is_use_subcol_) {
-    dst = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
-  }
-  #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
-  for (int t = 0; t < n_bin_block; ++t) {
-    const int start = t * bin_block_size;
-    const int end = std::min(start + bin_block_size, num_bin_);
-    for (int tid = 1; tid < n_data_block_; ++tid) {
-      auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
-      for (int i = start * 2; i < end * 2; ++i) {
-        dst[i] += src_ptr[i];
+  if (USE_QUANT_GRAD) {  // quantized gradients: entries are packed integers, one per bin
+    if (HIST_BITS == 32) {  // also taken by the <true, 32, 8> instantiation; INNER_HIST_BITS is not tested here
+      int64_t* dst = reinterpret_cast<int64_t*>(origin_hist_data_);  // default destination: the original histogram buffer
+      if (is_use_subcol_) {
+        dst = reinterpret_cast<int64_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);  // with sub-columns, merge into the region HistMove reads from
+      }
+      #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
+      for (int t = 0; t < n_bin_block; ++t) {  // each iteration owns the disjoint bin range [start, end)
+        const int start = t * bin_block_size;
+        const int end = std::min(start + bin_block_size, num_bin_);
+        for (int tid = 1; tid < n_data_block_; ++tid) {  // data block tid is stored at slot (tid - 1) of hist_buf
+          auto src_ptr = reinterpret_cast<const int64_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * (tid - 1);
+          for (int i = start; i < end; ++i) {
+            dst[i] += src_ptr[i];  // one add per bin on the packed int64_t entry
+          }
+        }
+      }
+    } else if (HIST_BITS == 16 && INNER_HIST_BITS == 16) {  // same merge as the 32-bit branch, on int32_t packed entries
+      int32_t* dst = reinterpret_cast<int32_t*>(origin_hist_data_);
+      if (is_use_subcol_) {
+        dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
+      }
+      #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
+      for (int t = 0; t < n_bin_block; ++t) {
+        const int start = t * bin_block_size;
+        const int end = std::min(start + bin_block_size, num_bin_);
+        for (int tid = 1; tid < n_data_block_; ++tid) {
+          auto src_ptr = reinterpret_cast<const int32_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * (tid - 1);
+          for (int i = start; i < end; ++i) {
+            dst[i] += src_ptr[i];
+          }
+        }
+      }
+    } else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) {  // widening merge: int16_t packed sources are summed into int32_t packed entries
+      int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);  // destination is always inside hist_buf, regardless of is_use_subcol_
+      std::memset(reinterpret_cast<void*>(dst), 0, num_bin_ * kInt16HistBufferEntrySize);  // destination is zeroed first because every source block, including tid 0, is added below
+      #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
+      for (int t = 0; t < n_bin_block; ++t) {
+        const int start = t * bin_block_size;
+        const int end = std::min(start + bin_block_size, num_bin_);
+        for (int tid = 0; tid < n_data_block_; ++tid) {  // unlike the other branches, starts at tid 0 and reads slot tid
+          auto src_ptr = reinterpret_cast<const int16_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * tid;
+          for (int i = start; i < end; ++i) {
+            const int16_t packed_hist = src_ptr[i];
+            const int32_t packed_hist_int32 = (static_cast<int32_t>(static_cast<int8_t>(packed_hist >> 8)) << 16) | static_cast<int32_t>(packed_hist & 0x00ff);  // sign-extended high byte goes to the upper 16 bits, low byte stays in the low bits
+            dst[i] += packed_hist_int32;
+          }
+        }
+      }
+    }
+  } else {  // non-quantized: the same hist_t merge this function performed before the change
+    hist_t* dst = origin_hist_data_;
+    if (is_use_subcol_) {
+      dst = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
+    }
+    #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
+    for (int t = 0; t < n_bin_block; ++t) {
+      const int start = t * bin_block_size;
+      const int end = std::min(start + bin_block_size, num_bin_);
+      for (int tid = 1; tid < n_data_block_; ++tid) {
+        auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
+        for (int i = start * 2; i < end * 2; ++i) {  // two hist_t values per bin
+          dst[i] += src_ptr[i];
+        }
       }
     }
   }
 }
 
+template void MultiValBinWrapper::HistMerge<false, 0, 0>(std::vector<hist_t,  // explicit instantiations; template arguments are <USE_QUANT_GRAD, HIST_BITS, INNER_HIST_BITS>
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
+template void MultiValBinWrapper::HistMerge<false, 0, 8>(std::vector<hist_t,  // non-quantized with INNER_HIST_BITS = 8
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
+template void MultiValBinWrapper::HistMerge<true, 16, 8>(std::vector<hist_t,  // quantized, HIST_BITS = 16, INNER_HIST_BITS = 8 (the widening merge branch)
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
+template void MultiValBinWrapper::HistMerge<true, 16, 16>(std::vector<hist_t,  // quantized, HIST_BITS = 16, INNER_HIST_BITS = 16
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
+template void MultiValBinWrapper::HistMerge<true, 32, 8>(std::vector<hist_t,  // quantized, HIST_BITS = 32, INNER_HIST_BITS = 8
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
+template void MultiValBinWrapper::HistMerge<true, 32, 32>(std::vector<hist_t,  // quantized, HIST_BITS = 32, INNER_HIST_BITS = 32
+  Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
+
 void MultiValBinWrapper::ResizeHistBuf(std::vector<hist_t,
   Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf,
   MultiValBin* sub_multi_val_bin,
@@ -389,7 +508,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector<std::unique_ptr<Featu
 
 void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data,
   const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
-  bool dense_only, bool sparse_only) {
+  bool dense_only, bool sparse_only, const int num_grad_quant_bins) {
   num_threads = OMP_NUM_THREADS();
   if (bin == nullptr) {
     return;
@@ -408,7 +527,7 @@ void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data,
   num_total_bin_ += bin->num_bin();
   num_elements_per_row_ += bin->num_element_per_row();
   multi_val_bin_wrapper_.reset(new MultiValBinWrapper(
-    bin, num_data, feature_groups_contained));
+    bin, num_data, feature_groups_contained, num_grad_quant_bins));
 }
 
 }  // namespace LightGBM
diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp
index 677b7dc6eb82..2509db5e722a 100644
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -30,7 +30,9 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo
   auto max_cat_threshold = this->config_->max_cat_threshold;
   // need to be able to hold smaller and larger best splits in SyncUpGlobalBestSplit
   size_t split_info_size = static_cast<size_t>(SplitInfo::Size(max_cat_threshold) * 2);
-  size_t histogram_size = static_cast<size_t>(this->share_state_->num_hist_total_bin() * kHistEntrySize);
+  size_t histogram_size = this->config_->use_quantized_grad ?
+    static_cast<size_t>(this->share_state_->num_hist_total_bin() * kInt32HistEntrySize) :
+    static_cast<size_t>(this->share_state_->num_hist_total_bin() * kHistEntrySize);
 
   // allocate buffer for communication
   size_t buffer_size = std::max(histogram_size, split_info_size);
@@ -43,8 +45,19 @@ void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, boo
   block_start_.resize(num_machines_);
   block_len_.resize(num_machines_);
 
+  if (this->config_->use_quantized_grad) {
+    block_start_int16_.resize(num_machines_);
+    block_len_int16_.resize(num_machines_);
+  }
+
   buffer_write_start_pos_.resize(this->num_features_);
   buffer_read_start_pos_.resize(this->num_features_);
+
+  if (this->config_->use_quantized_grad) {
+    buffer_write_start_pos_int16_.resize(this->num_features_);
+    buffer_read_start_pos_int16_.resize(this->num_features_);
+  }
+
   global_data_count_in_leaf_.resize(this->config_->num_leaves);
 }
 
@@ -55,100 +68,155 @@ void DataParallelTreeLearner<TREELEARNER_T>::ResetConfig(const Config* config) {
 }
 
 template <typename TREELEARNER_T>
-void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
-  TREELEARNER_T::BeforeTrain();
-  // generate feature partition for current tree
-  std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
-  std::vector<int> num_bins_distributed(num_machines_, 0);
-  for (int i = 0; i < this->train_data_->num_total_features(); ++i) {
-    int inner_feature_index = this->train_data_->InnerFeatureIndex(i);
-    if (inner_feature_index == -1) { continue; }
-    if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) {
-      int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
-      feature_distribution[cur_min_machine].push_back(inner_feature_index);
-      auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index);
-      if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) {
-        num_bin -= 1;
-      }
-      num_bins_distributed[cur_min_machine] += num_bin;
-    }
-    is_feature_aggregated_[inner_feature_index] = false;
-  }
-  // get local used feature
-  for (auto fid : feature_distribution[rank_]) {
-    is_feature_aggregated_[fid] = true;
-  }
-
+void DataParallelTreeLearner<TREELEARNER_T>::PrepareBufferPos(  // computes the reduce-scatter layout (block and per-feature offsets) for one histogram entry size
+  const std::vector<std::vector<int>>& feature_distribution,  // feature ids assigned to each machine, indexed by machine
+  std::vector<comm_size_t>* block_start,  // out: start offset of each machine's block
+  std::vector<comm_size_t>* block_len,  // out: length of each machine's block
+  std::vector<comm_size_t>* buffer_write_start_pos,  // out: per-feature offset within the layout spanning all machines
+  std::vector<comm_size_t>* buffer_read_start_pos,  // out: per-feature offset within this rank's own block
+  comm_size_t* reduce_scatter_size,  // out: sum of all block lengths
+  size_t hist_entry_size) {  // multiplier applied to each feature's bin count
   // get block start and block len for reduce scatter
+  *reduce_scatter_size = 0;
   for (int i = 0; i < num_machines_; ++i) {
+    (*block_len)[i] = 0;
     for (auto fid : feature_distribution[i]) {
       auto num_bin = this->train_data_->FeatureNumBin(fid);
       if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
         num_bin -= 1;
       }
+      (*block_len)[i] += num_bin * hist_entry_size;  // size contributed by this feature
     }
+    *reduce_scatter_size += (*block_len)[i];
   }
 
+  (*block_start)[0] = 0;
   for (int i = 1; i < num_machines_; ++i) {
+    (*block_start)[i] = (*block_start)[i - 1] + (*block_len)[i - 1];  // each block starts where the previous one ends
   }
 
+  // get buffer_write_start_pos
   int bin_size = 0;
   for (int i = 0; i < num_machines_; ++i) {
     for (auto fid : feature_distribution[i]) {
+      (*buffer_write_start_pos)[fid] = bin_size;  // running offset over all machines' features, in machine order
       auto num_bin = this->train_data_->FeatureNumBin(fid);
       if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
         num_bin -= 1;
       }
+      bin_size += num_bin * hist_entry_size;  // NOTE(review): bin_size is int while hist_entry_size is size_t, so the sum is narrowed on assignment
     }
   }
 
+  // get buffer_read_start_pos
   bin_size = 0;
   for (auto fid : feature_distribution[rank_]) {
+    (*buffer_read_start_pos)[fid] = bin_size;  // running offset over this rank's features only
     auto num_bin = this->train_data_->FeatureNumBin(fid);
     if (this->train_data_->FeatureBinMapper(fid)->GetMostFreqBin() == 0) {
       num_bin -= 1;
     }
+    bin_size += num_bin * hist_entry_size;
   }
+}
 
-  // sync global data sumup info
-  std::tuple<data_size_t, double, double> data(this->smaller_leaf_splits_->num_data_in_leaf(),
-                                               this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians());
-  int size = sizeof(data);
-  std::memcpy(input_buffer_.data(), &data, size);
-  // global sumup reduce
-  Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple<data_size_t, double, double>), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) {
-    comm_size_t used_size = 0;
-    const std::tuple<data_size_t, double, double> *p1;
-    std::tuple<data_size_t, double, double> *p2;
-    while (used_size < len) {
-      p1 = reinterpret_cast<const std::tuple<data_size_t, double, double> *>(src);
-      p2 = reinterpret_cast<std::tuple<data_size_t, double, double> *>(dst);
-      std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
-      std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
-      std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
-      src += type_size;
-      dst += type_size;
-      used_size += type_size;
+template <typename TREELEARNER_T>  // per-tree setup: partitions features across machines, prepares buffer layouts, and all-reduces the root leaf sums
+void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
+  TREELEARNER_T::BeforeTrain();
+  // generate feature partition for current tree
+  std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
+  std::vector<int> num_bins_distributed(num_machines_, 0);  // running bin total per machine
+  for (int i = 0; i < this->train_data_->num_total_features(); ++i) {
+    int inner_feature_index = this->train_data_->InnerFeatureIndex(i);
+    if (inner_feature_index == -1) { continue; }
+    if (this->col_sampler_.is_feature_used_bytree()[inner_feature_index]) {
+      int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));  // greedy: pick the machine with the fewest bins so far
+      feature_distribution[cur_min_machine].push_back(inner_feature_index);
+      auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index);
+      if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetMostFreqBin() == 0) {
+        num_bin -= 1;
+      }
+      num_bins_distributed[cur_min_machine] += num_bin;
     }
-  });
-  // copy back
-  std::memcpy(reinterpret_cast<void*>(&data), output_buffer_.data(), size);
-  // set global sumup info
-  this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data));
-  // init global data count in leaf
-  global_data_count_in_leaf_[0] = std::get<0>(data);
+    is_feature_aggregated_[inner_feature_index] = false;  // cleared for every valid feature; set again below for this rank's features
+  }
+  // get local used feature
+  for (auto fid : feature_distribution[rank_]) {
+    is_feature_aggregated_[fid] = true;
+  }
+
+  // get block start and block len for reduce scatter
+  if (this->config_->use_quantized_grad) {  // quantized: two layouts are prepared, one per integer entry size
+    PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_,  // layout sized with kInt32HistEntrySize goes into the regular members
+      &buffer_read_start_pos_, &reduce_scatter_size_, kInt32HistEntrySize);
+    PrepareBufferPos(feature_distribution, &block_start_int16_, &block_len_int16_, &buffer_write_start_pos_int16_,  // layout sized with kInt16HistEntrySize goes into the *_int16_ members
+      &buffer_read_start_pos_int16_, &reduce_scatter_size_int16_, kInt16HistEntrySize);
+  } else {
+    PrepareBufferPos(feature_distribution, &block_start_, &block_len_, &buffer_write_start_pos_,
+      &buffer_read_start_pos_, &reduce_scatter_size_, kHistEntrySize);
+  }
+
+  if (this->config_->use_quantized_grad) {  // quantized: the reduced tuple carries an extra int64_t with the integer gradient/hessian sum
+    // sync global data sumup info
+    std::tuple<data_size_t, double, double, int64_t> data(this->smaller_leaf_splits_->num_data_in_leaf(),
+                                                          this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
+                                                          this->smaller_leaf_splits_->int_sum_gradients_and_hessians());
+    int size = sizeof(data);
+    std::memcpy(input_buffer_.data(), &data, size);  // NOTE(review): byte-copies a std::tuple; this relies on the tuple being trivially copyable, which the standard does not guarantee
+    // global sumup reduce
+    Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple<data_size_t, double, double, int64_t>), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) {
+      comm_size_t used_size = 0;
+      const std::tuple<data_size_t, double, double, int64_t> *p1;
+      std::tuple<data_size_t, double, double, int64_t> *p2;
+      while (used_size < len) {  // reducer: adds each src tuple into the matching dst tuple, field by field
+        p1 = reinterpret_cast<const std::tuple<data_size_t, double, double, int64_t> *>(src);
+        p2 = reinterpret_cast<std::tuple<data_size_t, double, double, int64_t> *>(dst);
+        std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
+        std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
+        std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
+        std::get<3>(*p2) = std::get<3>(*p2) + std::get<3>(*p1);
+        src += type_size;
+        dst += type_size;
+        used_size += type_size;
+      }
+    });
+    // copy back
+    std::memcpy(reinterpret_cast<void*>(&data), output_buffer_.data(), size);
+    // set global sumup info
+    this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data), std::get<3>(data));  // three-argument Init also receives the reduced integer sum
+    // init global data count in leaf
+    global_data_count_in_leaf_[0] = std::get<0>(data);
+    // reset hist num bits according to global num data
+    this->gradient_discretizer_->template SetNumBitsInHistogramBin<true>(0, -1, GetGlobalDataCountInLeaf(0), 0);
+  } else {  // non-quantized: same reduction on the three-field tuple
+    // sync global data sumup info
+    std::tuple<data_size_t, double, double> data(this->smaller_leaf_splits_->num_data_in_leaf(),
+                                                this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians());
+    int size = sizeof(data);
+    std::memcpy(input_buffer_.data(), &data, size);
+    // global sumup reduce
+    Network::Allreduce(input_buffer_.data(), size, sizeof(std::tuple<data_size_t, double, double>), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) {
+      comm_size_t used_size = 0;
+      const std::tuple<data_size_t, double, double> *p1;
+      std::tuple<data_size_t, double, double> *p2;
+      while (used_size < len) {  // reducer: adds each src tuple into the matching dst tuple, field by field
+        p1 = reinterpret_cast<const std::tuple<data_size_t, double, double> *>(src);
+        p2 = reinterpret_cast<std::tuple<data_size_t, double, double> *>(dst);
+        std::get<0>(*p2) = std::get<0>(*p2) + std::get<0>(*p1);
+        std::get<1>(*p2) = std::get<1>(*p2) + std::get<1>(*p1);
+        std::get<2>(*p2) = std::get<2>(*p2) + std::get<2>(*p1);
+        src += type_size;
+        dst += type_size;
+        used_size += type_size;
+      }
+    });
+    // copy back
+    std::memcpy(reinterpret_cast<void*>(&data), output_buffer_.data(), size);
+    // set global sumup info
+    this->smaller_leaf_splits_->Init(std::get<1>(data), std::get<2>(data));
+    // init global data count in leaf
+    global_data_count_in_leaf_[0] = std::get<0>(data);
+  }
 }
 
 template <typename TREELEARNER_T>
@@ -167,23 +235,66 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits(const Tree* tree) {
       const BinMapper* feature_bin_mapper = this->train_data_->FeatureBinMapper(feature_index);
       const int offset = static_cast<int>(feature_bin_mapper->GetMostFreqBin() == 0);
       const int num_bin = feature_bin_mapper->num_bin();
-      hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData();
-      std::memset(reinterpret_cast<void*>(hist_ptr), 0, (num_bin - offset) * kHistEntrySize);
+      if (this->config_->use_quantized_grad) {
+        int32_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawDataInt32();
+        std::memset(reinterpret_cast<void*>(hist_ptr), 0, (num_bin - offset) * kInt32HistEntrySize);
+        int16_t* hist_ptr_int16 = this->smaller_leaf_histogram_array_[feature_index].RawDataInt16();
+        std::memset(reinterpret_cast<void*>(hist_ptr_int16), 0, (num_bin - offset) * kInt16HistEntrySize);
+      } else {
+        hist_t* hist_ptr = this->smaller_leaf_histogram_array_[feature_index].RawData();
+        std::memset(reinterpret_cast<void*>(hist_ptr), 0, (num_bin - offset) * kHistEntrySize);
+      }
     }
   }
   // construct local histograms
+  global_timer.Start("DataParallelTreeLearner::ReduceHistogram");
+  global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy");
   #pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
     if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false)
       continue;
     // copy to buffer
-    std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
+    if (this->config_->use_quantized_grad) {
+      const uint8_t local_smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<false>(this->smaller_leaf_splits_->leaf_index());
+      const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+      if (smaller_leaf_num_bits <= 16) {
+        std::memcpy(input_buffer_.data() + buffer_write_start_pos_int16_[feature_index],
+                    this->smaller_leaf_histogram_array_[feature_index].RawDataInt16(),
+                    this->smaller_leaf_histogram_array_[feature_index].SizeOfInt16Histgram());
+      } else {
+        if (local_smaller_leaf_num_bits == 32) {
+          std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
+                      this->smaller_leaf_histogram_array_[feature_index].RawDataInt32(),
+                      this->smaller_leaf_histogram_array_[feature_index].SizeOfInt32Histgram());
+        } else {
+          this->smaller_leaf_histogram_array_[feature_index].CopyFromInt16ToInt32(
+            input_buffer_.data() + buffer_write_start_pos_[feature_index]);
+        }
+      }
+    } else {
+      std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
                 this->smaller_leaf_histogram_array_[feature_index].RawData(),
                 this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
+    }
   }
+  global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::Copy");
   // Reduce scatter for histogram
-  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
-                         block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
+  global_timer.Start("DataParallelTreeLearner::ReduceHistogram::ReduceScatter");
+  if (!this->config_->use_quantized_grad) {
+    Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(hist_t), block_start_.data(),
+                           block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramSumReducer);
+  } else {
+    const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+    if (smaller_leaf_num_bits <= 16) {
+      Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_int16_, sizeof(int16_t), block_start_int16_.data(),
+                            block_len_int16_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &Int16HistogramSumReducer);
+    } else {
+      Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(int_hist_t), block_start_.data(),
+                            block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &Int32HistogramSumReducer);
+    }
+  }
+  global_timer.Stop("DataParallelTreeLearner::ReduceHistogram::ReduceScatter");
+  global_timer.Stop("DataParallelTreeLearner::ReduceHistogram");
   this->FindBestSplitsFromHistograms(
       this->col_sampler_.is_feature_used_bytree(), true, tree);
 }
@@ -198,6 +309,26 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
       this->col_sampler_.GetByNode(tree, this->larger_leaf_splits_->leaf_index());
   double smaller_leaf_parent_output = this->GetParentOutput(tree, this->smaller_leaf_splits_.get());
   double larger_leaf_parent_output = this->GetParentOutput(tree, this->larger_leaf_splits_.get());
+
+  if (this->config_->use_quantized_grad && this->larger_leaf_splits_ != nullptr && this->larger_leaf_splits_->leaf_index() >= 0) {
+    const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index());
+    const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode<true>(parent_index);
+    const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->larger_leaf_splits_->leaf_index());
+    const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+    if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) {
+      CHECK_LE(smaller_leaf_num_bits, 16);
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static)
+      for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
+        OMP_LOOP_EX_BEGIN();
+        if (!is_feature_aggregated_[feature_index]) continue;
+        this->larger_leaf_histogram_array_[feature_index].CopyToBuffer(this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    }
+  }
+
   OMP_INIT_EX();
   #pragma omp parallel for schedule(static)
   for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
@@ -206,12 +337,39 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
     const int tid = omp_get_thread_num();
     const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index);
     // restore global histograms from buffer
-    this->smaller_leaf_histogram_array_[feature_index].FromMemory(
-      output_buffer_.data() + buffer_read_start_pos_[feature_index]);
+    if (this->config_->use_quantized_grad) {
+      const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+      if (smaller_leaf_num_bits <= 16) {
+        this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt16(
+          output_buffer_.data() + buffer_read_start_pos_int16_[feature_index]);
+      } else {
+        this->smaller_leaf_histogram_array_[feature_index].FromMemoryInt32(
+          output_buffer_.data() + buffer_read_start_pos_[feature_index]);
+      }
+    } else {
+      this->smaller_leaf_histogram_array_[feature_index].FromMemory(
+        output_buffer_.data() + buffer_read_start_pos_[feature_index]);
+    }
 
-    this->train_data_->FixHistogram(feature_index,
-                                    this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
-                                    this->smaller_leaf_histogram_array_[feature_index].RawData());
+    if (this->config_->use_quantized_grad) {
+      const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+      const int64_t int_sum_gradient_and_hessian = this->smaller_leaf_splits_->int_sum_gradients_and_hessians();
+      if (smaller_leaf_num_bits <= 16) {
+        this->train_data_->template FixHistogramInt<int32_t, int32_t, 16, 16>(
+          feature_index,
+          int_sum_gradient_and_hessian,
+          reinterpret_cast<hist_t*>(this->smaller_leaf_histogram_array_[feature_index].RawDataInt16()));
+      } else {
+        this->train_data_->template FixHistogramInt<int64_t, int64_t, 32, 32>(
+          feature_index,
+          int_sum_gradient_and_hessian,
+          reinterpret_cast<hist_t*>(this->smaller_leaf_histogram_array_[feature_index].RawDataInt32()));
+      }
+    } else {
+      this->train_data_->FixHistogram(feature_index,
+                                      this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
+                                      this->smaller_leaf_histogram_array_[feature_index].RawData());
+    }
 
     this->ComputeBestSplitForFeature(
         this->smaller_leaf_histogram_array_, feature_index, real_feature_index,
@@ -225,8 +383,31 @@ void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const
     if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->leaf_index() < 0) continue;
 
     // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms
-    this->larger_leaf_histogram_array_[feature_index].Subtract(
-      this->smaller_leaf_histogram_array_[feature_index]);
+    if (this->config_->use_quantized_grad) {
+      const int parent_index = std::min(this->smaller_leaf_splits_->leaf_index(), this->larger_leaf_splits_->leaf_index());
+      const uint8_t parent_num_bits = this->gradient_discretizer_->template GetHistBitsInNode<true>(parent_index);
+      const uint8_t larger_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->larger_leaf_splits_->leaf_index());
+      const uint8_t smaller_leaf_num_bits = this->gradient_discretizer_->template GetHistBitsInLeaf<true>(this->smaller_leaf_splits_->leaf_index());
+      if (parent_num_bits <= 16) {
+        CHECK_LE(smaller_leaf_num_bits, 16);
+        CHECK_LE(larger_leaf_num_bits, 16);
+        this->larger_leaf_histogram_array_[feature_index].template Subtract<true, int32_t, int32_t, int32_t, 16, 16, 16>(
+              this->smaller_leaf_histogram_array_[feature_index]);
+      } else if (larger_leaf_num_bits <= 16) {
+        CHECK_LE(smaller_leaf_num_bits, 16);
+        this->larger_leaf_histogram_array_[feature_index].template Subtract<true, int64_t, int32_t, int32_t, 32, 16, 16>(
+            this->smaller_leaf_histogram_array_[feature_index], this->gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
+      } else if (smaller_leaf_num_bits <= 16) {
+        this->larger_leaf_histogram_array_[feature_index].template Subtract<true, int64_t, int32_t, int64_t, 32, 16, 32>(
+              this->smaller_leaf_histogram_array_[feature_index]);
+      } else {
+        this->larger_leaf_histogram_array_[feature_index].template Subtract<true, int64_t, int64_t, int64_t, 32, 32, 32>(
+              this->smaller_leaf_histogram_array_[feature_index]);
+      }
+    } else {
+      this->larger_leaf_histogram_array_[feature_index].Subtract(
+        this->smaller_leaf_histogram_array_[feature_index]);
+    }
 
     this->ComputeBestSplitForFeature(
         this->larger_leaf_histogram_array_, feature_index, real_feature_index,
@@ -273,6 +454,10 @@ void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, in
   // need update global number of data in leaf
   global_data_count_in_leaf_[*left_leaf] = best_split_info.left_count;
   global_data_count_in_leaf_[*right_leaf] = best_split_info.right_count;
+  // reset hist num bits according to global num data
+  if (this->config_->use_quantized_grad) {
+    this->gradient_discretizer_->template SetNumBitsInHistogramBin<true>(*left_leaf, *right_leaf, GetGlobalDataCountInLeaf(*left_leaf), GetGlobalDataCountInLeaf(*right_leaf));
+  }
 }
 
 // instantiate template classes, otherwise linker cannot find the code
diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp
index 7804292d15d0..d917ed7917ec 100644
--- a/src/treelearner/feature_histogram.hpp
+++ b/src/treelearner/feature_histogram.hpp
@@ -51,6 +51,18 @@ class FeatureHistogram {
   /*! \brief Disable copy */
   FeatureHistogram(const FeatureHistogram&) = delete;
 
+  /*! \brief Init the feature histogram with a main buffer and a separate int16 buffer
+   * \param data buffer stored as data_ (returned by RawData() / RawDataInt32())
+   * \param data_int16 buffer stored as data_int16_ (returned by RawDataInt16())
+   * \param meta feature meta information stored as meta_
+   */
+  void Init(hist_t* data, int16_t* data_int16, const FeatureMetainfo* meta) {
+    meta_ = meta;
+    data_ = data;
+    data_int16_ = data_int16;
+    ResetFunc();
+  }
+
   /*!
    * \brief Init the feature histogram
    * \param feature the feature data for this histogram
@@ -59,6 +71,7 @@ class FeatureHistogram {
   void Init(hist_t* data, const FeatureMetainfo* meta) {
     meta_ = meta;
     data_ = data;
+    data_int16_ = nullptr;  // this overload receives no int16 buffer, so RawDataInt16() yields nullptr
     ResetFunc();
   }
 
@@ -72,13 +85,80 @@ class FeatureHistogram {
 
   hist_t* RawData() { return data_; }
 
+  int32_t* RawDataInt32() { return reinterpret_cast<int32_t*>(data_); }  // same memory as RawData(), viewed as int32_t
+
+  int16_t* RawDataInt16() { return data_int16_; }  // separate buffer handed to Init; nullptr after the two-argument Init
+
   /*!
    * \brief Subtract current histograms with other
    * \param other The histogram that want to subtract
    */
-  void Subtract(const FeatureHistogram& other) {
-    for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
-      data_[i] -= other.data_[i];
+  template <bool USE_DIST_GRAD = false,
+    typename THIS_HIST_T = hist_t, typename OTHER_HIST_T = hist_t, typename RESULT_HIST_T = hist_t,
+    int THIS_HIST_BITS = 0, int OTHER_HIST_BITS = 0, int RESULT_HIST_BITS = 0>
+  void Subtract(const FeatureHistogram& other, const int32_t* buffer = nullptr) {
+    if (USE_DIST_GRAD) {  // integer path: operands are accessed through the *_HIST_T template types
+      const THIS_HIST_T* this_int_data = THIS_HIST_BITS == 16 ?  // minuend: data_int16_, or buffer (16-bit result), or data_
+        reinterpret_cast<const THIS_HIST_T*>(data_int16_) :
+        (RESULT_HIST_BITS == 16 ?
+          reinterpret_cast<const THIS_HIST_T*>(buffer) :
+          reinterpret_cast<const THIS_HIST_T*>(data_));
+      const OTHER_HIST_T* other_int_data = OTHER_HIST_BITS == 16 ?  // subtrahend: other's int16 buffer or other's data_
+        reinterpret_cast<OTHER_HIST_T*>(other.data_int16_) :
+        reinterpret_cast<OTHER_HIST_T*>(other.data_);
+      RESULT_HIST_T* result_int_data = RESULT_HIST_BITS == 16 ?  // destination: this histogram's int16 buffer or data_
+        reinterpret_cast<RESULT_HIST_T*>(data_int16_) :
+        reinterpret_cast<RESULT_HIST_T*>(data_);
+      if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 32) {  // widen other, keep a 64-bit packed result
+        for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) {
+          const int32_t other_grad_hess = static_cast<int32_t>(other_int_data[i]);
+          const int64_t this_grad_hess = this_int_data[i];
+          const int64_t other_grad_hess_int64 =
+            (static_cast<int64_t>(static_cast<int16_t>(other_grad_hess >> 16)) << 32) |  // high 16 bits, sign-extended, to the upper 32 bits
+            (static_cast<int64_t>(other_grad_hess & 0x0000ffff));  // low 16 bits to the lower 32 bits
+          const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64;
+          result_int_data[i] = result_grad_hess;
+        }
+      } else if (THIS_HIST_BITS == 32 && OTHER_HIST_BITS == 16 && RESULT_HIST_BITS == 16) {  // widen other, subtract, then narrow the result
+        for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) {
+          const int32_t other_grad_hess = static_cast<int32_t>(other_int_data[i]);
+          const int64_t this_grad_hess = this_int_data[i];  // read from buffer in this branch (see this_int_data above)
+          const int64_t other_grad_hess_int64 =
+            (static_cast<int64_t>(static_cast<int16_t>(other_grad_hess >> 16)) << 32) |
+            (static_cast<int64_t>(other_grad_hess & 0x0000ffff));
+          const int64_t result_grad_hess = this_grad_hess - other_grad_hess_int64;
+          const int32_t result_grad_hess_int32 =
+            (static_cast<int32_t>(result_grad_hess >> 32) << 16) |  // upper 32 bits moved into the high 16 bits
+            static_cast<int32_t>(result_grad_hess & 0x00000000ffffffff);  // NOTE(review): 32-bit mask; presumably the low half fits in 16 bits - confirm
+          result_int_data[i] = result_grad_hess_int32;
+        }
+      } else {  // remaining combinations: subtract the packed entries directly
+        for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) {
+          result_int_data[i] = this_int_data[i] - other_int_data[i];
+        }
+      }
+    } else {  // non-integer path: element-wise over 2 * (num_bin - offset) hist_t values
+      for (int i = 0; i < (meta_->num_bin - meta_->offset) * 2; ++i) {
+        data_[i] -= other.data_[i];
+      }
+    }
+  }
+
+  void CopyToBuffer(int32_t* buffer) {  // copies (num_bin - offset) 64-bit entries from data_ into buffer
+    const int64_t* data_ptr = reinterpret_cast<const int64_t*>(data_);
+    int64_t* buffer_ptr = reinterpret_cast<int64_t*>(buffer);  // buffer is written as int64_t entries despite its int32_t* type
+    for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) {
+      buffer_ptr[i] = data_ptr[i];
+    }
+  }
+
+  void CopyFromInt16ToInt32(char* buffer) {  // widens each 32-bit packed entry of the int16 buffer into a 64-bit entry in buffer
+    const int32_t* int16_data = reinterpret_cast<const int32_t*>(RawDataInt16());  // one int32_t holds a pair of 16-bit halves
+    int64_t* int32_data = reinterpret_cast<int64_t*>(buffer);  // one int64_t holds a pair of 32-bit halves
+    for (int i = 0; i < meta_->num_bin - meta_->offset; ++i) {
+      const int32_t int16_val = int16_data[i];
+      int32_data[i] = (static_cast<int64_t>(static_cast<int16_t>(int16_val >> 16)) << 32) |  // high half: sign-extended
+        static_cast<int64_t>(int16_val & 0x0000ffff);  // low half: zero-extended
     }
   }
 
@@ -94,8 +174,23 @@ class FeatureHistogram {
     output->gain *= meta_->penalty;
   }
 
+  void FindBestThresholdInt(int64_t sum_gradient_and_hessian,  // packed sums; forwarded unchanged to the search function
+                            double grad_scale, double hess_scale,
+                            const uint8_t num_bits_bin,
+                            const uint8_t num_bits_acc,
+                            data_size_t num_data,
+                            const FeatureConstraint* constraints,
+                            double parent_output,
+                            SplitInfo* output) {
+    output->default_left = true;  // defaults written before the search runs
+    output->gain = kMinScore;
+    int_find_best_threshold_fun_(sum_gradient_and_hessian, grad_scale, hess_scale, num_bits_bin, num_bits_acc, num_data,  // function object assigned in FuncForNumricalL3
+                             constraints, parent_output, output);
+    output->gain *= meta_->penalty;  // scale the resulting gain by the feature penalty
+  }
+
   template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
-  double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data,
+  double BeforeNumerical(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data,
                         SplitInfo* output, int* rand_threshold) {
     is_splittable_ = false;
     output->monotone_type = meta_->monotone_type;
@@ -112,6 +207,27 @@ class FeatureHistogram {
     return gain_shift + meta_->config->min_gain_to_split;
   }
 
+  template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
+  double BeforeNumericalInt(int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, double parent_output, data_size_t num_data,
+                        SplitInfo* output, int* rand_threshold) {
+    is_splittable_ = false;  // reset here; this function never sets it back to true
+    output->monotone_type = meta_->monotone_type;
+    const int32_t int_sum_gradient = static_cast<int32_t>(sum_gradient_and_hessian >> 32);  // upper 32 bits, signed
+    const uint32_t int_sum_hessian = static_cast<uint32_t>(sum_gradient_and_hessian & 0x00000000ffffffff);  // lower 32 bits, unsigned
+    const double sum_gradient = static_cast<double>(int_sum_gradient) * grad_scale;  // integer sums rescaled to doubles
+    const double sum_hessian = static_cast<double>(int_sum_hessian) * hess_scale;
+    double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+        sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2,
+        meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output);
+    *rand_threshold = 0;  // stays 0 unless USE_RAND is set and num_bin > 2
+    if (USE_RAND) {
+      if (meta_->num_bin - 2 > 0) {
+        *rand_threshold = meta_->rand.NextInt(0, meta_->num_bin - 2);
+      }
+    }
+    return gain_shift + meta_->config->min_gain_to_split;  // returned value is stored as min_gain_shift by the callers in FuncForNumricalL3
+  }
+
   void FuncForNumrical() {
     if (meta_->config->extra_trees) {
       if (meta_->config->monotone_constraints.empty()) {
@@ -155,6 +271,119 @@ class FeatureHistogram {
 
   template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
   void FuncForNumricalL3() {
+  if (meta_->config->use_quantized_grad) {
+#define TEMPLATE_PREFIX_INT USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING
+#define LAMBDA_ARGUMENTS_INT                                         \
+  int64_t sum_gradient_and_hessian, double grad_scale, double hess_scale, const uint8_t hist_bits_bin, const uint8_t hist_bits_acc, data_size_t num_data, \
+      const FeatureConstraint* constraints, double parent_output, SplitInfo *output
+#define BEFORE_ARGUMENTS_INT sum_gradient_and_hessian, grad_scale, hess_scale, parent_output, num_data, output, &rand_threshold
+#define FUNC_ARGUMENTS_INT                                                      \
+  sum_gradient_and_hessian, grad_scale, hess_scale, num_data, constraints, min_gain_shift, \
+      output, rand_threshold, parent_output
+
+      if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
+        if (meta_->missing_type == MissingType::Zero) {
+          int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumericalInt<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS_INT);
+            if (hist_bits_acc <= 16) {
+              CHECK_LE(hist_bits_bin, 16);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, true, false, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, true, false, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+            } else {
+              if (hist_bits_bin == 32) {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, true, false, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, true, false, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+              } else {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, true, false, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, true, false, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+              }
+            }
+          };
+        } else {
+          int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumericalInt<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS_INT);
+            if (hist_bits_acc <= 16) {
+              CHECK_LE(hist_bits_bin, 16);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, true, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, false, true, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+            } else {
+              if (hist_bits_bin == 32) {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, true, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, false, true, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+              } else {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, true, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, false, false, true, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+              }
+            }
+          };
+        }
+      } else {
+        if (meta_->missing_type != MissingType::NaN) {
+          int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumericalInt<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS_INT);
+            if (hist_bits_acc <= 16) {
+              CHECK_LE(hist_bits_bin, 16);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+            } else {
+              if (hist_bits_bin == 32) {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+              } else {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+              }
+            }
+          };
+        } else {
+          int_find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS_INT) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumericalInt<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS_INT);
+            if (hist_bits_acc <= 16) {
+              CHECK_LE(hist_bits_bin, 16);
+              FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int32_t, int32_t, int16_t, int16_t, 16, 16>(
+                  FUNC_ARGUMENTS_INT);
+            } else {
+              if (hist_bits_bin == 32) {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int64_t, int64_t, int32_t, int32_t, 32, 32>(
+                    FUNC_ARGUMENTS_INT);
+              } else {
+                FindBestThresholdSequentiallyInt<TEMPLATE_PREFIX_INT, true, false, false, int32_t, int64_t, int16_t, int32_t, 16, 32>(
+                    FUNC_ARGUMENTS_INT);
+              }
+            }
+            output->default_left = false;
+          };
+        }
+      }
+#undef TEMPLATE_PREFIX_INT  // the *_INT helper macros are only meant for the quantized-gradient branch above
+#undef LAMBDA_ARGUMENTS_INT
+#undef BEFORE_ARGUMENTS_INT
+#undef FUNC_ARGUMENTS_INT
+  } else {
 #define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING
 #define LAMBDA_ARGUMENTS                                         \
   double sum_gradient, double sum_hessian, data_size_t num_data, \
@@ -164,56 +393,57 @@ class FeatureHistogram {
   sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \
       output, rand_threshold, parent_output
 
-    if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
-      if (meta_->missing_type == MissingType::Zero) {
-        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
-          int rand_threshold = 0;
-          double min_gain_shift =
-              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
-                  BEFORE_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>(
-              FUNC_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, false, true, false>(
-              FUNC_ARGUMENTS);
-        };
-      } else {
-        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
-          int rand_threshold = 0;
-          double min_gain_shift =
-              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
-                  BEFORE_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>(
-              FUNC_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, false, false, true>(
-              FUNC_ARGUMENTS);
-        };
-      }
-    } else {
-      if (meta_->missing_type != MissingType::NaN) {
-        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
-          int rand_threshold = 0;
-          double min_gain_shift =
-              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
-                  BEFORE_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
-              FUNC_ARGUMENTS);
-        };
+      if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
+        if (meta_->missing_type == MissingType::Zero) {
+          find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumerical<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>(
+                FUNC_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, false, true, false>(
+                FUNC_ARGUMENTS);
+          };
+        } else {
+          find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumerical<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>(
+                FUNC_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, false, false, true>(
+                FUNC_ARGUMENTS);
+          };
+        }
       } else {
-        find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
-          int rand_threshold = 0;
-          double min_gain_shift =
-              BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
-                  BEFORE_ARGUMENTS);
-          FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
-              FUNC_ARGUMENTS);
-          output->default_left = false;
-        };
+        if (meta_->missing_type != MissingType::NaN) {
+          find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumerical<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
+                FUNC_ARGUMENTS);
+          };
+        } else {
+          find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
+            int rand_threshold = 0;
+            double min_gain_shift =
+                BeforeNumerical<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+                    BEFORE_ARGUMENTS);
+            FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
+                FUNC_ARGUMENTS);
+            output->default_left = false;
+          };
+        }
       }
-    }
 #undef TEMPLATE_PREFIX
 #undef LAMBDA_ARGUMENTS
 #undef BEFORE_ARGUMENTS
 #undef FUNC_ARGURMENTS
+    }
   }
 
   void FuncForCategorical() {
@@ -716,6 +946,14 @@ class FeatureHistogram {
     return (meta_->num_bin - meta_->offset) * kHistEntrySize;
   }
 
+  int SizeOfInt32Histgram() const {  // byte size: (num_bin - offset) entries of kInt32HistEntrySize each
+    return (meta_->num_bin - meta_->offset) * kInt32HistEntrySize;
+  }
+
+  int SizeOfInt16Histgram() const {  // byte size: (num_bin - offset) entries of kInt16HistEntrySize each
+    return (meta_->num_bin - meta_->offset) * kInt16HistEntrySize;
+  }
+
   /*!
    * \brief Restore histogram from memory
    */
@@ -724,6 +962,16 @@ class FeatureHistogram {
                 (meta_->num_bin - meta_->offset) * kHistEntrySize);
   }
 
+  void FromMemoryInt32(char* memory_data) {  // copies (num_bin - offset) entries of kInt32HistEntrySize into data_
+    std::memcpy(data_, memory_data,
+                (meta_->num_bin - meta_->offset) * kInt32HistEntrySize);
+  }
+
+  void FromMemoryInt16(char* memory_data) {  // copies (num_bin - offset) entries of kInt16HistEntrySize into data_int16_
+    std::memcpy(data_int16_, memory_data,
+                (meta_->num_bin - meta_->offset) * kInt16HistEntrySize);
+  }
+
   /*!
    * \brief True if this histogram can be splitted
    */
@@ -1082,14 +1330,312 @@ class FeatureHistogram {
     }
   }
 
+  template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING,
+          bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING, typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T,
+          typename HIST_BIN_T, typename HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
+  void FindBestThresholdSequentiallyInt(int64_t int_sum_gradient_and_hessian,  // packed totals: gradient in high 32 bits, hessian in low 32 bits
+                                        const double grad_scale, const double hess_scale,  // convert integer gradient / hessian sums to doubles
+                                        data_size_t num_data,  // data count; split counts are estimated from it via hessian sums
+                                        const FeatureConstraint* constraints,
+                                        double min_gain_shift, SplitInfo* output,  // output is overwritten only when a better split is found
+                                        int rand_threshold, double parent_output) {
+    const int8_t offset = meta_->offset;
+    PACKED_HIST_ACC_T best_sum_left_gradient_and_hessian = 0;
+    PACKED_HIST_ACC_T local_int_sum_gradient_and_hessian =  // totals repacked to 16 + 16 bits when HIST_BITS_ACC == 16
+      HIST_BITS_ACC == 16 ?
+      ((static_cast<int32_t>(int_sum_gradient_and_hessian >> 32) << 16) | static_cast<int32_t>(int_sum_gradient_and_hessian & 0x0000ffff)) :
+      int_sum_gradient_and_hessian;
+    double best_gain = kMinScore;
+    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
+    const double cnt_factor = static_cast<double>(num_data) /  // data count per unit of integer hessian
+      static_cast<double>(static_cast<uint32_t>(int_sum_gradient_and_hessian & 0x00000000ffffffff));
+
+    BasicConstraint best_right_constraints;
+    BasicConstraint best_left_constraints;
+    bool constraint_update_necessary =
+        USE_MC && constraints->ConstraintDifferentDependingOnThreshold();
+
+    if (USE_MC) {
+      constraints->InitCumulativeConstraints(REVERSE);
+    }
+    // the histogram is read from data_int16_ for 16-bit bins and from data_ otherwise
+    const PACKED_HIST_BIN_T* data_ptr = nullptr;
+    if (HIST_BITS_BIN == 16) {
+      data_ptr = reinterpret_cast<const PACKED_HIST_BIN_T*>(data_int16_);
+    } else {
+      data_ptr = reinterpret_cast<const PACKED_HIST_BIN_T*>(data_);
+    }
+    if (REVERSE) {
+      PACKED_HIST_ACC_T sum_right_gradient_and_hessian = 0;
+
+      int t = meta_->num_bin - 1 - offset - NA_AS_MISSING;
+      const int t_end = 1 - offset;
+
+      // from right to left, and we don't need data in bin0
+      for (; t >= t_end; --t) {
+        // need to skip default bin
+        if (SKIP_DEFAULT_BIN) {
+          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
+            continue;
+          }
+        }
+        const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t];
+        if (HIST_BITS_ACC != HIST_BITS_BIN) {  // widen the packed bin value to the accumulator layout before adding
+          const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ?
+            ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+            (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x0000ffff))) :
+            ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+            (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x00000000ffffffff)));
+          sum_right_gradient_and_hessian += grad_and_hess_acc;
+        } else {
+          sum_right_gradient_and_hessian += grad_and_hess;
+        }
+        const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ?
+          static_cast<uint32_t>(sum_right_gradient_and_hessian & 0x0000ffff) :
+          static_cast<uint32_t>(sum_right_gradient_and_hessian & 0x00000000ffffffff);
+        data_size_t right_count = Common::RoundInt(int_sum_right_hessian * cnt_factor);  // count estimated from the hessian sum
+        double sum_right_hessian = int_sum_right_hessian * hess_scale;
+        // if data not enough, or sum hessian too small
+        if (right_count < meta_->config->min_data_in_leaf ||
+            sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
+          continue;
+        }
+        data_size_t left_count = num_data - right_count;
+        // if data not enough
+        if (left_count < meta_->config->min_data_in_leaf) {
+          break;
+        }
+
+        const PACKED_HIST_ACC_T sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_right_gradient_and_hessian;
+        const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ?
+          static_cast<uint32_t>(sum_left_gradient_and_hessian & 0x0000ffff) :
+          static_cast<uint32_t>(sum_left_gradient_and_hessian & 0x00000000ffffffff);
+        double sum_left_hessian = int_sum_left_hessian * hess_scale;
+        // if sum hessian too small
+        if (sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) {
+          break;
+        }
+
+        double sum_right_gradient = HIST_BITS_ACC == 16 ?
+          static_cast<double>(static_cast<int16_t>(sum_right_gradient_and_hessian >> 16)) * grad_scale :
+          static_cast<double>(static_cast<int32_t>(sum_right_gradient_and_hessian >> 32)) * grad_scale;
+        double sum_left_gradient = HIST_BITS_ACC == 16 ?
+          static_cast<double>(static_cast<int16_t>(sum_left_gradient_and_hessian >> 16)) * grad_scale :
+          static_cast<double>(static_cast<int32_t>(sum_left_gradient_and_hessian >> 32)) * grad_scale;
+        if (USE_RAND) {
+          if (t - 1 + offset != rand_threshold) {
+            continue;
+          }
+        }
+
+        if (USE_MC && constraint_update_necessary) {
+          constraints->Update(t + offset);
+        }
+
+        // current split gain
+        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+            sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient,
+            sum_right_hessian + kEpsilon, meta_->config->lambda_l1,
+            meta_->config->lambda_l2, meta_->config->max_delta_step,
+            constraints, meta_->monotone_type, meta_->config->path_smooth,
+            left_count, right_count, parent_output);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) {
+          continue;
+        }
+
+        // mark as able to be split
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          if (USE_MC) {
+            best_right_constraints = constraints->RightToBasicConstraint();
+            best_left_constraints = constraints->LeftToBasicConstraint();
+            if (best_right_constraints.min > best_right_constraints.max ||
+                best_left_constraints.min > best_left_constraints.max) {
+              continue;
+            }
+          }
+          best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian;
+          // left is <= threshold, right is > threshold.  so this is t-1
+          best_threshold = static_cast<uint32_t>(t - 1 + offset);
+          best_gain = current_gain;
+        }
+      }
+    } else {
+      PACKED_HIST_ACC_T sum_left_gradient_and_hessian = 0;
+
+      int t = 0;
+      const int t_end = meta_->num_bin - 2 - offset;
+
+      if (NA_AS_MISSING) {
+        if (offset == 1) {  // start from the totals minus every stored bin, i.e. the part absent from the histogram
+          sum_left_gradient_and_hessian = local_int_sum_gradient_and_hessian;
+          for (int i = 0; i < meta_->num_bin - offset; ++i) {
+            const PACKED_HIST_BIN_T grad_and_hess = data_ptr[i];
+            if (HIST_BITS_ACC != HIST_BITS_BIN) {
+              const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ?
+                ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+                (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x0000ffff))) :
+                ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+                (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x00000000ffffffff)));
+              sum_left_gradient_and_hessian -= grad_and_hess_acc;
+            } else {
+              sum_left_gradient_and_hessian -= grad_and_hess;
+            }
+          }
+          t = -1;
+        }
+      }
+
+      for (; t <= t_end; ++t) {
+        if (SKIP_DEFAULT_BIN) {
+          if ((t + offset) == static_cast<int>(meta_->default_bin)) {
+            continue;
+          }
+        }
+        if (t >= 0) {  // t == -1 only evaluates the initial sum computed above
+          const PACKED_HIST_BIN_T grad_and_hess = data_ptr[t];
+          if (HIST_BITS_ACC != HIST_BITS_BIN) {
+            const PACKED_HIST_ACC_T grad_and_hess_acc = HIST_BITS_BIN == 16 ?
+              ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+              (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x0000ffff))) :
+              ((static_cast<PACKED_HIST_ACC_T>(static_cast<HIST_BIN_T>(grad_and_hess >> HIST_BITS_BIN)) << HIST_BITS_ACC) |
+              (static_cast<PACKED_HIST_ACC_T>(grad_and_hess & 0x00000000ffffffff)));
+            sum_left_gradient_and_hessian += grad_and_hess_acc;
+          } else {
+            sum_left_gradient_and_hessian += grad_and_hess;
+          }
+        }
+        // if data not enough, or sum hessian too small
+        const uint32_t int_sum_left_hessian = HIST_BITS_ACC == 16 ?
+          static_cast<uint32_t>(sum_left_gradient_and_hessian & 0x0000ffff) :
+          static_cast<uint32_t>(sum_left_gradient_and_hessian & 0x00000000ffffffff);
+        const data_size_t left_count = Common::RoundInt(static_cast<double>(int_sum_left_hessian) * cnt_factor);
+        const double sum_left_hessian = static_cast<double>(int_sum_left_hessian) * hess_scale;
+        if (left_count < meta_->config->min_data_in_leaf ||
+            sum_left_hessian < meta_->config->min_sum_hessian_in_leaf) {
+          continue;
+        }
+        data_size_t right_count = num_data - left_count;
+        // if data not enough
+        if (right_count < meta_->config->min_data_in_leaf) {
+          break;
+        }
+
+        const PACKED_HIST_ACC_T sum_right_gradient_and_hessian = local_int_sum_gradient_and_hessian - sum_left_gradient_and_hessian;
+        const uint32_t int_sum_right_hessian = HIST_BITS_ACC == 16 ?
+          static_cast<uint32_t>(sum_right_gradient_and_hessian & 0x0000ffff) :
+          static_cast<uint32_t>(sum_right_gradient_and_hessian & 0x00000000ffffffff);
+        const double sum_right_hessian = static_cast<double>(int_sum_right_hessian) * hess_scale;
+        // if sum Hessian too small
+        if (sum_right_hessian < meta_->config->min_sum_hessian_in_leaf) {
+          break;
+        }
+
+        double sum_right_gradient = HIST_BITS_ACC == 16 ?
+          static_cast<double>(static_cast<int16_t>(sum_right_gradient_and_hessian >> 16)) * grad_scale :
+          static_cast<double>(static_cast<int32_t>(sum_right_gradient_and_hessian >> 32)) * grad_scale;
+        double sum_left_gradient = HIST_BITS_ACC == 16 ?
+          static_cast<double>(static_cast<int16_t>(sum_left_gradient_and_hessian >> 16)) * grad_scale :
+          static_cast<double>(static_cast<int32_t>(sum_left_gradient_and_hessian >> 32)) * grad_scale;
+        if (USE_RAND) {
+          if (t + offset != rand_threshold) {
+            continue;
+          }
+        }
+        // current split gain. NOTE(review): unlike the REVERSE branch, constraints->Update() is never called here - confirm intended
+        double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+            sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient,
+            sum_right_hessian + kEpsilon, meta_->config->lambda_l1,
+            meta_->config->lambda_l2, meta_->config->max_delta_step,
+            constraints, meta_->monotone_type, meta_->config->path_smooth, left_count,
+            right_count, parent_output);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) {
+          continue;
+        }
+
+        // mark as able to be split
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          if (USE_MC) {
+            best_right_constraints = constraints->RightToBasicConstraint();
+            best_left_constraints = constraints->LeftToBasicConstraint();
+            if (best_right_constraints.min > best_right_constraints.max ||
+                best_left_constraints.min > best_left_constraints.max) {
+              continue;
+            }
+          }
+          best_sum_left_gradient_and_hessian = sum_left_gradient_and_hessian;
+          best_threshold = static_cast<uint32_t>(t + offset);
+          best_gain = current_gain;
+        }
+      }
+    }
+    // store the best split found, but only if it beats the gain already held by output
+    if (is_splittable_ && best_gain > output->gain + min_gain_shift) {
+      const int32_t int_best_sum_left_gradient = HIST_BITS_ACC == 16 ?
+        static_cast<int32_t>(static_cast<int16_t>(best_sum_left_gradient_and_hessian >> 16)) :
+        static_cast<int32_t>(best_sum_left_gradient_and_hessian >> 32);
+      const uint32_t int_best_sum_left_hessian = HIST_BITS_ACC == 16 ?
+        static_cast<uint32_t>(best_sum_left_gradient_and_hessian & 0x0000ffff) :
+        static_cast<uint32_t>(best_sum_left_gradient_and_hessian & 0x00000000ffffffff);
+      const double best_sum_left_gradient = static_cast<double>(int_best_sum_left_gradient) * grad_scale;
+      const double best_sum_left_hessian = static_cast<double>(int_best_sum_left_hessian) * hess_scale;
+      const int64_t best_sum_left_gradient_and_hessian_int64 = HIST_BITS_ACC == 16 ?  // back to the 32 + 32 bit layout
+          ((static_cast<int64_t>(static_cast<int16_t>(best_sum_left_gradient_and_hessian >> 16)) << 32) |
+          static_cast<int64_t>(best_sum_left_gradient_and_hessian & 0x0000ffff)) :
+          best_sum_left_gradient_and_hessian;
+      const int64_t best_sum_right_gradient_and_hessian = int_sum_gradient_and_hessian - best_sum_left_gradient_and_hessian_int64;
+      const int32_t int_best_sum_right_gradient = static_cast<int32_t>(best_sum_right_gradient_and_hessian >> 32);
+      const uint32_t int_best_sum_right_hessian = static_cast<uint32_t>(best_sum_right_gradient_and_hessian & 0x00000000ffffffff);
+      const double best_sum_right_gradient = static_cast<double>(int_best_sum_right_gradient) * grad_scale;
+      const double best_sum_right_hessian = static_cast<double>(int_best_sum_right_hessian) * hess_scale;
+      const data_size_t best_left_count = Common::RoundInt(static_cast<double>(int_best_sum_left_hessian) * cnt_factor);
+      const data_size_t best_right_count = Common::RoundInt(static_cast<double>(int_best_sum_right_hessian) * cnt_factor);
+      // update split information
+      output->threshold = best_threshold;
+      output->left_output =
+          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+              best_sum_left_gradient, best_sum_left_hessian,
+              meta_->config->lambda_l1, meta_->config->lambda_l2,
+              meta_->config->max_delta_step, best_left_constraints, meta_->config->path_smooth,
+              best_left_count, parent_output);
+      output->left_count = best_left_count;
+      output->left_sum_gradient = best_sum_left_gradient;
+      output->left_sum_hessian = best_sum_left_hessian;
+      output->left_sum_gradient_and_hessian = best_sum_left_gradient_and_hessian_int64;
+      output->right_output =
+          CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
+              best_sum_right_gradient,
+              best_sum_right_hessian, meta_->config->lambda_l1,
+              meta_->config->lambda_l2, meta_->config->max_delta_step,
+              best_right_constraints, meta_->config->path_smooth, best_right_count,
+              parent_output);
+      output->right_count = best_right_count;
+      output->right_sum_gradient = best_sum_right_gradient;
+      output->right_sum_hessian = best_sum_right_hessian;
+      output->right_sum_gradient_and_hessian = best_sum_right_gradient_and_hessian;
+      output->gain = best_gain - min_gain_shift;
+      output->default_left = REVERSE;
+    }
+  }
+
   const FeatureMetainfo* meta_;
   /*! \brief sum of gradient of each bin */
   hist_t* data_;
+  int16_t* data_int16_;
   bool is_splittable_ = true;
 
   std::function<void(double, double, data_size_t, const FeatureConstraint*,
                      double, SplitInfo*)>
       find_best_threshold_fun_;
+
+  std::function<void(int64_t, double, double, const uint8_t, const uint8_t, data_size_t, const FeatureConstraint*,
+                     double, SplitInfo*)>
+      int_find_best_threshold_fun_;
 };
 
 class HistogramPool {
@@ -1200,18 +1746,35 @@ class HistogramPool {
       pool_.resize(cache_size);
       data_.resize(cache_size);
     }
-    OMP_INIT_EX();
-#pragma omp parallel for schedule(static)
-    for (int i = old_cache_size; i < cache_size; ++i) {
-      OMP_LOOP_EX_BEGIN();
-      pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
-      data_[i].resize(num_total_bin * 2);
-      for (int j = 0; j < train_data->num_features(); ++j) {
-        pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
+
+    if (config->use_quantized_grad) {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static)
+      for (int i = old_cache_size; i < cache_size; ++i) {
+        OMP_LOOP_EX_BEGIN();
+        pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
+        data_[i].resize(num_total_bin);
+        for (int j = 0; j < train_data->num_features(); ++j) {
+          int16_t* data_ptr = reinterpret_cast<int16_t*>(data_[i].data());
+          pool_[i][j].Init(data_[i].data() + offsets[j], data_ptr + 2 * offsets[j], &feature_metas_[j]);
+        }
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    } else {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static)
+      for (int i = old_cache_size; i < cache_size; ++i) {
+        OMP_LOOP_EX_BEGIN();
+        pool_[i].reset(new FeatureHistogram[train_data->num_features()]);
+        data_[i].resize(num_total_bin * 2);
+        for (int j = 0; j < train_data->num_features(); ++j) {
+          pool_[i][j].Init(data_[i].data() + offsets[j] * 2, &feature_metas_[j]);
+        }
+        OMP_LOOP_EX_END();
       }
-      OMP_LOOP_EX_END();
+      OMP_THROW_EX();
     }
-    OMP_THROW_EX();
   }
 
   void ResetConfig(const Dataset* train_data, const Config* config) {
diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp
index f92da0fe9f76..294be28b6f86 100644
--- a/src/treelearner/gpu_tree_learner.cpp
+++ b/src/treelearner/gpu_tree_learner.cpp
@@ -991,7 +991,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
     nullptr, nullptr,
     nullptr, nullptr);
   // then construct sparse features on CPU
-  train_data_->ConstructHistograms(is_sparse_feature_used,
+  train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
     smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
     gradients_, hessians_,
     ordered_gradients_.data(), ordered_hessians_.data(),
@@ -1056,7 +1056,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
       gradients_, hessians_,
       ordered_gradients_.data(), ordered_hessians_.data());
     // then construct sparse features on CPU
-    train_data_->ConstructHistograms(is_sparse_feature_used,
+    train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
       larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
       gradients_, hessians_,
       ordered_gradients_.data(), ordered_hessians_.data(),
diff --git a/src/treelearner/gradient_discretizer.cpp b/src/treelearner/gradient_discretizer.cpp
new file mode 100644
index 000000000000..4c00f73ab12c
--- /dev/null
+++ b/src/treelearner/gradient_discretizer.cpp
@@ -0,0 +1,262 @@
+/*!
+ * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */
+
+#include "gradient_discretizer.hpp"
+#include <LightGBM/network.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace LightGBM {
+
+// Allocates the buffers used for gradient quantization and pre-draws the random values
+// that DiscretizeGradients consumes when stochastic rounding is enabled.
+void GradientDiscretizer::Init(
+  const data_size_t num_data, const int num_leaves,
+  const int num_features, const Dataset* train_data) {
+  const int num_threads = OMP_NUM_THREADS();
+  num_leaves_ = num_leaves;
+  discretized_gradients_and_hessians_vector_.resize(num_data * 2);
+  ordered_int_gradients_and_hessians_.resize(2 * num_data);
+  gradient_random_values_.resize(num_data);
+  hessian_random_values_.resize(num_data);
+  random_values_use_start_eng_ = std::mt19937(random_seed_);
+  random_values_use_start_dist_ = std::uniform_int_distribution<data_size_t>(0, num_data);
+
+  // fill the random values block by block; every block seeds its own engines from its index
+  int block_count = 0;
+  data_size_t data_per_block = 0;
+  Threading::BlockInfo<data_size_t>(num_data, 512, &block_count, &data_per_block);
+  #pragma omp parallel for schedule(static, 1) num_threads(num_threads)
+  for (int block_id = 0; block_id < block_count; ++block_id) {
+    std::mt19937 grad_engine(random_seed_ + block_id);
+    std::mt19937 hess_engine(random_seed_ + block_id + num_threads);
+    std::uniform_real_distribution<double> grad_dist(0.0f, 1.0f);
+    std::uniform_real_distribution<double> hess_dist(0.0f, 1.0f);
+    const data_size_t block_begin = block_id * data_per_block;
+    const data_size_t block_end = std::min(block_begin + data_per_block, num_data);
+    for (data_size_t pos = block_begin; pos < block_end; ++pos) {
+      gradient_random_values_[pos] = grad_dist(grad_engine);
+      hessian_random_values_[pos] = hess_dist(hess_engine);
+    }
+  }
+
+  // nothing has been discretized yet: clear the maxima and the scales
+  max_gradient_abs_ = 0.0f;
+  max_hessian_abs_ = 0.0f;
+  gradient_scale_ = 0.0f;
+  hessian_scale_ = 0.0f;
+  inverse_gradient_scale_ = 0.0f;
+  inverse_hessian_scale_ = 0.0f;
+  leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
+  node_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
+  global_leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
+  global_node_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
+  leaf_grad_hess_stats_.resize(num_leaves_ * 2, 0.0);
+  change_hist_bits_buffer_.resize(num_features);
+  #pragma omp parallel for schedule(static) num_threads(num_threads)
+  for (int feature_index = 0; feature_index < num_features; ++feature_index) {
+    const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index);
+    change_hist_bits_buffer_[feature_index].resize((bin_mapper->num_bin() - static_cast<int>(bin_mapper->GetMostFreqBin() == 0)) * 2);
+  }
+}
+
+// Quantizes the gradients and hessians of all data into int8 pairs, stored as
+// (hessian, gradient) at positions (2 * i, 2 * i + 1). The scales are derived from the
+// largest absolute gradient / hessian (synchronized over machines when distributed).
+// With stochastic rounding a pre-drawn random value in [0, 1) replaces the fixed 0.5 offset.
+void GradientDiscretizer::DiscretizeGradients(
+  const data_size_t num_data,
+  const score_t* input_gradients,
+  const score_t* input_hessians) {
+  // do not read the first element of an empty input
+  double max_gradient = num_data > 0 ? std::fabs(input_gradients[0]) : 0.0;
+  double max_hessian = num_data > 0 ? std::fabs(input_hessians[0]) : 0.0;
+  const int num_threads = OMP_NUM_THREADS();
+  std::vector<double> thread_max_gradient(num_threads, max_gradient);
+  std::vector<double> thread_max_hessian(num_threads, max_hessian);
+  Threading::For<data_size_t>(0, num_data, 1024,
+    [input_gradients, input_hessians, &thread_max_gradient, &thread_max_hessian]
+    (int, data_size_t start, data_size_t end) {
+      int thread_id = omp_get_thread_num();
+      for (data_size_t i = start; i < end; ++i) {
+        double fabs_grad = std::fabs(input_gradients[i]);
+        double fabs_hess = std::fabs(input_hessians[i]);
+        if (fabs_grad > thread_max_gradient[thread_id]) {
+          thread_max_gradient[thread_id] = fabs_grad;
+        }
+        if (fabs_hess > thread_max_hessian[thread_id]) {
+          thread_max_hessian[thread_id] = fabs_hess;
+        }
+      }});
+  // reduce the per-thread maxima
+  max_gradient = *std::max_element(thread_max_gradient.begin(), thread_max_gradient.end());
+  max_hessian = *std::max_element(thread_max_hessian.begin(), thread_max_hessian.end());
+  if (Network::num_machines() > 1) {
+    max_gradient = Network::GlobalSyncUpByMax(max_gradient);
+    max_hessian = Network::GlobalSyncUpByMax(max_hessian);
+  }
+  max_gradient_abs_ = max_gradient;
+  max_hessian_abs_ = max_hessian;
+  gradient_scale_ = max_gradient_abs_ / static_cast<double>(num_grad_quant_bins_ / 2);
+  if (is_constant_hessian_) {
+    hessian_scale_ = max_hessian_abs_;
+  } else {
+    hessian_scale_ = max_hessian_abs_ / static_cast<double>(num_grad_quant_bins_);
+  }
+  // a zero scale (all gradients or hessians are zero) would make the inverse infinite and
+  // 0 * inf = NaN below; converting NaN to int8_t is undefined, so quantize everything to 0
+  inverse_gradient_scale_ = gradient_scale_ > 0.0 ? 1.0f / gradient_scale_ : 0.0f;
+  inverse_hessian_scale_ = hessian_scale_ > 0.0 ? 1.0f / hessian_scale_ : 0.0f;
+
+  const int random_values_use_start = random_values_use_start_dist_(random_values_use_start_eng_);
+  int8_t* discretized_int8 = discretized_gradients_and_hessians_vector_.data();
+  if (stochastic_rounding_) {
+    if (is_constant_hessian_) {
+      #pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const double gradient = input_gradients[i];
+        const data_size_t random_value_pos = (i + random_values_use_start) % num_data;
+        discretized_int8[2 * i + 1] = gradient >= 0.0f ?
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) :
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]);
+        discretized_int8[2 * i] = static_cast<int8_t>(1);
+      }
+    } else {
+      #pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const double gradient = input_gradients[i];
+        const data_size_t random_value_pos = (i + random_values_use_start) % num_data;
+        discretized_int8[2 * i + 1] = gradient >= 0.0f ?
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) :
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]);
+        discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + hessian_random_values_[random_value_pos]);
+      }
+    }
+  } else {
+    if (is_constant_hessian_) {
+      #pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const double gradient = input_gradients[i];
+        discretized_int8[2 * i + 1] = gradient >= 0.0f ?
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) :
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5);
+        discretized_int8[2 * i] = static_cast<int8_t>(1);
+      }
+    } else {
+      #pragma omp parallel for schedule(static) num_threads(num_threads)
+      for (data_size_t i = 0; i < num_data; ++i) {
+        const double gradient = input_gradients[i];
+        discretized_int8[2 * i + 1] = gradient >= 0.0f ?
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) :
+          static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5);
+        discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + 0.5);
+      }
+    }
+  }
+}
+
+// Chooses the integer width (8, 16 or 32 bits) of the histogram bins of one or two leaves
+// from the bound (number of data in the leaf) * num_grad_quant_bins_.
+// IS_GLOBAL selects the global_* vectors instead of the local ones.
+template <bool IS_GLOBAL>
+void GradientDiscretizer::SetNumBitsInHistogramBin(
+  const int left_leaf_index, const int right_leaf_index,
+  const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf) {
+  std::vector<int8_t>& leaf_bits = IS_GLOBAL ?
+    global_leaf_num_bits_in_histogram_bin_ : leaf_num_bits_in_histogram_bin_;
+  std::vector<int8_t>& node_bits = IS_GLOBAL ?
+    global_node_num_bits_in_histogram_bin_ : node_num_bits_in_histogram_bin_;
+  const uint64_t quant_bins = static_cast<uint64_t>(num_grad_quant_bins_);
+
+  // 8 bits while the bound stays below 256, 16 bits below 65536, 32 bits otherwise
+  const auto required_bits = [quant_bins](const data_size_t num_data_in_leaf) -> int8_t {
+    const uint64_t max_stat_per_bin = static_cast<uint64_t>(num_data_in_leaf) * quant_bins;
+    if (max_stat_per_bin < 256) {
+      return 8;
+    }
+    if (max_stat_per_bin < 65536) {
+      return 16;
+    }
+    return 32;
+  };
+
+  // right_leaf_index == -1 means that only the left leaf has to be set
+  const bool has_right_leaf = right_leaf_index != -1;
+  if (has_right_leaf) {
+    // keep the width the left leaf had so far as the width of the node
+    node_bits[left_leaf_index] = leaf_bits[left_leaf_index];
+  }
+  leaf_bits[left_leaf_index] = required_bits(num_data_in_left_leaf);
+  if (has_right_leaf) {
+    leaf_bits[right_leaf_index] = required_bits(num_data_in_right_leaf);
+  }
+  // every other entry of both vectors is left untouched
+}
+
+template void GradientDiscretizer::SetNumBitsInHistogramBin<false>(
+  const int left_leaf_index, const int right_leaf_index,
+  const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
+
+template void GradientDiscretizer::SetNumBitsInHistogramBin<true>(
+  const int left_leaf_index, const int right_leaf_index,
+  const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
+
+// Recomputes the output of every leaf of the tree from the given gradients and hessians.
+// With tree_learner == "data" the per-leaf sums are first combined by Network::GlobalSum.
+void GradientDiscretizer::RenewIntGradTreeOutput(
+  Tree* tree, const Config* config, const DataPartition* data_partition,
+  const score_t* gradients, const score_t* hessians,
+  const std::function<data_size_t(int)>& leaf_index_to_global_num_data) {
+  global_timer.Start("GradientDiscretizer::RenewIntGradTreeOutput");
+  // sums gradients / hessians of the local data on a leaf and returns the local data count
+  const auto sum_on_leaf = [data_partition, gradients, hessians]
+    (const int leaf_id, double* out_sum_gradient, double* out_sum_hessian) {
+      data_size_t leaf_cnt = 0;
+      const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
+      double sum_gradient = 0.0f, sum_hessian = 0.0f;
+      #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
+      for (data_size_t i = 0; i < leaf_cnt; ++i) {
+        sum_gradient += gradients[data_indices[i]];
+        sum_hessian += hessians[data_indices[i]];
+      }
+      *out_sum_gradient = sum_gradient;
+      *out_sum_hessian = sum_hessian;
+      return leaf_cnt;
+    };
+
+  // computes the leaf output from the sums and the data count and stores it in the tree
+  const auto set_leaf_output = [tree, config]
+    (const int leaf_id, const double sum_gradient, const double sum_hessian, const data_size_t num_data_in_leaf) {
+      tree->SetLeafOutput(leaf_id, FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(
+        sum_gradient, sum_hessian,
+        config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth,
+        num_data_in_leaf, 0.0f));
+    };
+
+  double sum_gradient = 0.0f, sum_hessian = 0.0f;
+  if (config->tree_learner == std::string("data")) {
+    for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
+      sum_on_leaf(leaf_id, &sum_gradient, &sum_hessian);
+      leaf_grad_hess_stats_[2 * leaf_id] = sum_gradient;
+      leaf_grad_hess_stats_[2 * leaf_id + 1] = sum_hessian;
+    }
+    const std::vector<double> global_stats = Network::GlobalSum<double>(&leaf_grad_hess_stats_);
+    for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
+      set_leaf_output(leaf_id, global_stats[2 * leaf_id], global_stats[2 * leaf_id + 1],
+                      leaf_index_to_global_num_data(leaf_id));
+    }
+  } else {
+    for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
+      const data_size_t leaf_cnt = sum_on_leaf(leaf_id, &sum_gradient, &sum_hessian);
+      set_leaf_output(leaf_id, sum_gradient, sum_hessian, leaf_cnt);
+    }
+  }
+  global_timer.Stop("GradientDiscretizer::RenewIntGradTreeOutput");
+}
+
+}  // namespace LightGBM
diff --git a/src/treelearner/gradient_discretizer.hpp b/src/treelearner/gradient_discretizer.hpp
new file mode 100644
index 000000000000..352788f7d093
--- /dev/null
+++ b/src/treelearner/gradient_discretizer.hpp
@@ -0,0 +1,128 @@
+/*!
+ * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for
+ * license information.
+ */
+#ifndef LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_
+#define LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_
+
+#include <LightGBM/bin.h>
+#include <LightGBM/meta.h>
+#include <LightGBM/tree.h>
+#include <LightGBM/utils/threading.h>
+
+#include <random>
+#include <vector>
+
+#include "data_partition.hpp"
+#include "feature_histogram.hpp"
+
+namespace LightGBM {
+
+/*!
+ * \brief State for training with quantized gradients: the discretized gradients and hessians,
+ *        their scaling factors, and the histogram bit widths tracked per leaf and per node.
+ */
+class GradientDiscretizer {
+ public:
+  /*! \brief Record the quantization settings; iter_ starts at 0 */
+  GradientDiscretizer(int num_grad_quant_bins, int num_trees, int random_seed, bool is_constant_hessian, const bool stochastic_rounding)
+      : num_grad_quant_bins_(num_grad_quant_bins), iter_(0), num_trees_(num_trees), random_seed_(random_seed),
+        stochastic_rounding_(stochastic_rounding), is_constant_hessian_(is_constant_hessian) {}
+
+  /*! \brief Virtual: the class exposes virtual methods, so it may be destroyed through a base-class pointer */
+  virtual ~GradientDiscretizer() {}
+
+  /*! \brief Discretize the input gradients and hessians of num_data records */
+  virtual void DiscretizeGradients(
+    const data_size_t num_data,
+    const score_t* input_gradients,
+    const score_t* input_hessians);
+
+  /*! \brief Raw pointer to the int8_t buffer holding the discretized gradients and hessians */
+  virtual const int8_t* discretized_gradients_and_hessians() const {
+    return discretized_gradients_and_hessians_vector_.data();
+  }
+
+  /*! \brief Current value of gradient_scale_ */
+  virtual double grad_scale() const { return gradient_scale_; }
+
+  /*! \brief Current value of hessian_scale_ */
+  virtual double hess_scale() const { return hessian_scale_; }
+
+  /*! \brief Initialize for the given data size, number of leaves, number of features and training data */
+  virtual void Init(
+    const data_size_t num_data, const int num_leaves,
+    const int num_features, const Dataset* train_data);
+
+  /*! \brief Set the histogram bin bit widths for a pair of leaves given the number of data in each */
+  template <bool IS_GLOBAL>
+  void SetNumBitsInHistogramBin(
+    const int left_leaf_index, const int right_leaf_index,
+    const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
+
+  /*! \brief Histogram bit width recorded for a leaf; IS_GLOBAL selects the global_* table */
+  template <bool IS_GLOBAL>
+  int8_t GetHistBitsInLeaf(const int leaf_index) {
+    return IS_GLOBAL ? global_leaf_num_bits_in_histogram_bin_[leaf_index] : leaf_num_bits_in_histogram_bin_[leaf_index];
+  }
+
+  /*! \brief Histogram bit width recorded for a tree node; IS_GLOBAL selects the global_* table */
+  template <bool IS_GLOBAL>
+  int8_t GetHistBitsInNode(const int node_index) {
+    return IS_GLOBAL ? global_node_num_bits_in_histogram_bin_[node_index] : node_num_bits_in_histogram_bin_[node_index];
+  }
+
+  /*! \brief Mutable pointer to the ordered_int_gradients_and_hessians_ buffer */
+  int8_t* ordered_int_gradients_and_hessians() {
+    return ordered_int_gradients_and_hessians_.data();
+  }
+
+  /*! \brief Recompute the leaf outputs of tree from the score_t gradients and hessians passed in */
+  void RenewIntGradTreeOutput(
+    Tree* tree, const Config* config, const DataPartition* data_partition,
+    const score_t* gradients, const score_t* hessians,
+    const std::function<data_size_t(int)>& leaf_index_to_global_num_data);
+
+  /*! \brief Pointer to the int32_t buffer kept for feature_index in change_hist_bits_buffer_ */
+  int32_t* GetChangeHistBitsBuffer(const int feature_index) {
+    return change_hist_bits_buffer_[feature_index].data();
+  }
+
+ protected:
+  int num_grad_quant_bins_;
+  int iter_;
+  int num_trees_;
+  int random_seed_;
+  bool stochastic_rounding_;
+
+  std::vector<double> gradient_random_values_;
+  std::vector<double> hessian_random_values_;
+  std::mt19937 random_values_use_start_eng_;
+  std::uniform_int_distribution<data_size_t> random_values_use_start_dist_;
+  std::vector<int8_t> discretized_gradients_and_hessians_vector_;
+  std::vector<int8_t> ordered_int_gradients_and_hessians_;
+  // scalars below are defaulted so that the accessors never read an indeterminate value
+  double max_gradient_abs_ = 0.0;
+  double max_hessian_abs_ = 0.0;
+
+  double gradient_scale_ = 0.0;
+  double hessian_scale_ = 0.0;
+  double inverse_gradient_scale_ = 0.0;
+  double inverse_hessian_scale_ = 0.0;
+
+  bool is_constant_hessian_;
+  int num_leaves_ = 0;
+
+  std::vector<int8_t> leaf_num_bits_in_histogram_bin_;
+  std::vector<int8_t> node_num_bits_in_histogram_bin_;
+  std::vector<int8_t> global_leaf_num_bits_in_histogram_bin_;
+  std::vector<int8_t> global_node_num_bits_in_histogram_bin_;
+
+  std::vector<double> leaf_grad_hess_stats_;
+  std::vector<std::vector<int32_t>> change_hist_bits_buffer_;
+};
+
+}  // namespace LightGBM
+
+#endif  // LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_
diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp
index 46d8ce417857..163bfc4df9ca 100644
--- a/src/treelearner/leaf_splits.hpp
+++ b/src/treelearner/leaf_splits.hpp
@@ -85,6 +85,38 @@ class LeafSplits {
     sum_hessians_ = tmp_sum_hessians;
   }
 
+
+  /*!
+   * \brief Set up the splits for leaf 0 by summing over every data record
+   * \param int_gradients_and_hessians Discretized gradients and hessians, two int8_t values per record
+   * \param grad_scale Scaling factor to recover original gradients from discretized gradients
+   * \param hess_scale Scaling factor to recover original hessians from discretized hessians
+   */
+  void Init(const int8_t* int_gradients_and_hessians,
+    const double grad_scale, const double hess_scale) {
+    leaf_index_ = 0;
+    data_indices_ = nullptr;
+    num_data_in_leaf_ = num_data_;
+    const int16_t* packed_pairs = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
+    double grad_total = 0.0f;
+    double hess_total = 0.0f;
+    int64_t packed_total = 0;
+#pragma omp parallel for schedule(static, 512) reduction(+:grad_total, hess_total, packed_total) if (num_data_in_leaf_ >= 1024 && !deterministic_)
+    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
+      // the hessian is the first int8_t of each pair, the gradient the second
+      hess_total += int_gradients_and_hessians[2 * i] * hess_scale;
+      grad_total += int_gradients_and_hessians[2 * i + 1] * grad_scale;
+      const int16_t packed_pair = packed_pairs[i];
+      const int64_t grad_bits = static_cast<int64_t>(static_cast<int8_t>(packed_pair >> 8)) << 32;
+      const int64_t hess_bits = static_cast<int64_t>(packed_pair & 0x00ff);
+      packed_total += grad_bits | hess_bits;
+    }
+    sum_gradients_ = grad_total;
+    sum_hessians_ = hess_total;
+    int_sum_gradients_and_hessians_ = packed_total;
+  }
+
+
   /*!
    * \brief Init splits on current leaf of partial data.
    * \param leaf Index of current leaf
@@ -109,6 +141,40 @@ class LeafSplits {
   }
 
 
+  /*!
+   * \brief Init splits on current leaf of partial data.
+   * \param leaf Index of current leaf
+   * \param data_partition current data partition
+   * \param int_gradients_and_hessians Discretized gradients and hessians, addressed by data index
+   * \param grad_scale Scaling factor to recover original gradients from discretized gradients
+   * \param hess_scale Scaling factor to recover original hessians from discretized hessians
+   */
+  void Init(int leaf, const DataPartition* data_partition,
+            const int8_t* int_gradients_and_hessians,
+            const score_t grad_scale, const score_t hess_scale) {
+    leaf_index_ = leaf;
+    data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
+    double tmp_sum_gradients = 0.0f, tmp_sum_hessians = 0.0f;
+    const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
+    int64_t tmp_sum_gradients_and_hessians = 0;
+    // run the reduction in parallel only when deterministic_ is not set, matching the all-data overload
+#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
+    for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
+      const data_size_t idx = data_indices_[i];
+      tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
+      tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale;
+      // the packed view must be read at the same data index (idx, not i) as the int8_t view above
+      const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[idx];
+      const int64_t packed_long_int_grad_and_hess =
+        (static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) | (static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
+      tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
+    }
+    sum_gradients_ = tmp_sum_gradients;
+    sum_hessians_ = tmp_sum_hessians;
+    int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
+  }
+
+
   /*!
   * \brief Init splits on current leaf, only update sum_gradients and sum_hessians
   * \param sum_gradients
@@ -120,6 +186,19 @@ class LeafSplits {
     sum_hessians_ = sum_hessians;
   }
 
+  /*!
+  * \brief Reset to leaf 0 and store precomputed sums; no data is traversed
+  * \param sum_gradients Value stored as the sum of gradients
+  * \param sum_hessians Value stored as the sum of Hessians
+  * \param int_sum_gradients_and_hessians Value stored as the sum of discretized gradients and Hessians
+  */
+  void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) {
+    int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians;
+    sum_hessians_ = sum_hessians;
+    sum_gradients_ = sum_gradients;
+    leaf_index_ = 0;
+  }
+
   /*!
   * \brief Init splits on current leaf
   */
@@ -142,6 +221,9 @@ class LeafSplits {
   /*! \brief Get sum of Hessians of current leaf */
   double sum_hessians() const { return sum_hessians_; }
 
+  /*! \brief Get the packed sum of discretized gradients and Hessians of current leaf (the Init overloads put gradients in the upper 32 bits, Hessians in the lower bits) */
+  int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; }
+
   /*! \brief Get indices of data of current leaf */
   const data_size_t* data_indices() const { return data_indices_; }
 
@@ -162,6 +244,8 @@ class LeafSplits {
   double sum_gradients_;
   /*! \brief sum of Hessians of current leaf */
   double sum_hessians_;
+  /*! \brief sum of discretized gradients and Hessians of current leaf, packed into one 64-bit integer */
+  int64_t int_sum_gradients_and_hessians_;
   /*! \brief indices of data of current leaf */
   const data_size_t* data_indices_;
   /*! \brief weight of current leaf */
diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h
index 29f4e1688b99..b942dceab28b 100644
--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -71,15 +71,24 @@ class DataParallelTreeLearner: public TREELEARNER_T {
     }
   }
 
+  void PrepareBufferPos(
+    const std::vector<std::vector<int>>& feature_distribution,
+    std::vector<comm_size_t>* block_start,
+    std::vector<comm_size_t>* block_len,
+    std::vector<comm_size_t>* buffer_write_start_pos,
+    std::vector<comm_size_t>* buffer_read_start_pos,
+    comm_size_t* reduce_scatter_size,
+    size_t hist_entry_size);
+
  private:
   /*! \brief Rank of local machine */
   int rank_;
   /*! \brief Number of machines of this parallel task */
   int num_machines_;
   /*! \brief Buffer for network send */
-  std::vector<char> input_buffer_;
+  std::vector<char, Common::AlignmentAllocator<char, 32>> input_buffer_;
   /*! \brief Buffer for network receive */
-  std::vector<char> output_buffer_;
+  std::vector<char, Common::AlignmentAllocator<char, 32>> output_buffer_;
   /*! \brief different machines will aggregate histograms for different features,
        use this to mark local aggregate features*/
   std::vector<bool> is_feature_aggregated_;
@@ -87,12 +96,22 @@ class DataParallelTreeLearner: public TREELEARNER_T {
   std::vector<comm_size_t> block_start_;
   /*! \brief Block size for reduce scatter */
   std::vector<comm_size_t> block_len_;
+  /*! \brief Block start index for reduce scatter with int16 histograms */
+  std::vector<comm_size_t> block_start_int16_;
+  /*! \brief Block size for reduce scatter with int16 histograms */
+  std::vector<comm_size_t> block_len_int16_;
   /*! \brief Write positions for feature histograms */
   std::vector<comm_size_t> buffer_write_start_pos_;
   /*! \brief Read positions for local feature histograms */
   std::vector<comm_size_t> buffer_read_start_pos_;
+  /*! \brief Write positions for feature histograms with int16 histograms */
+  std::vector<comm_size_t> buffer_write_start_pos_int16_;
+  /*! \brief Read positions for local feature histograms with int16 histograms */
+  std::vector<comm_size_t> buffer_read_start_pos_int16_;
   /*! \brief Size for reduce scatter */
   comm_size_t reduce_scatter_size_;
+  /*! \brief Size for reduce scatter with int16 histograms */
+  comm_size_t reduce_scatter_size_int16_;
   /*! \brief Store global number of data in leaves  */
   std::vector<data_size_t> global_data_count_in_leaf_;
 };
diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp
index 5ca8a3f047f6..c322c1a796c2 100644
--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -21,6 +21,7 @@ namespace LightGBM {
 
 SerialTreeLearner::SerialTreeLearner(const Config* config)
     : config_(config), col_sampler_(config) {
+  gradient_discretizer_ = nullptr;  // created in Init() only when config_->use_quantized_grad is set
 }
 
 SerialTreeLearner::~SerialTreeLearner() {
@@ -60,6 +61,11 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
   ordered_gradients_.resize(num_data_);
   ordered_hessians_.resize(num_data_);
 
+  if (config_->use_quantized_grad) {
+    gradient_discretizer_.reset(new GradientDiscretizer(config_->num_grad_quant_bins, config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding));
+    gradient_discretizer_->Init(num_data_, config_->num_leaves, num_features_, train_data_);
+  }
+
   GetShareStates(train_data_, is_constant_hessian, true);
   histogram_pool_.DynamicChangeSize(train_data_,
   share_state_->num_hist_total_bin(),
@@ -76,17 +82,31 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset,
                                        bool is_constant_hessian,
                                        bool is_first_time) {
   if (is_first_time) {
-    share_state_.reset(dataset->GetShareStates(
-        ordered_gradients_.data(), ordered_hessians_.data(),
+    if (config_->use_quantized_grad) {
+      share_state_.reset(dataset->GetShareStates<true, 32>(
+        reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
         col_sampler_.is_feature_used_bytree(), is_constant_hessian,
-        config_->force_col_wise, config_->force_row_wise));
+        config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
+    } else {
+      share_state_.reset(dataset->GetShareStates<false, 0>(
+          ordered_gradients_.data(), ordered_hessians_.data(),
+          col_sampler_.is_feature_used_bytree(), is_constant_hessian,
+          config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
+    }
   } else {
     CHECK_NOTNULL(share_state_);
     // cannot change is_hist_col_wise during training
-    share_state_.reset(dataset->GetShareStates(
-        ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
-        is_constant_hessian, share_state_->is_col_wise,
-        !share_state_->is_col_wise));
+    if (config_->use_quantized_grad) {
+      share_state_.reset(dataset->GetShareStates<true, 32>(
+          reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
+          col_sampler_.is_feature_used_bytree(), is_constant_hessian,
+          share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins));
+    } else {
+      share_state_.reset(dataset->GetShareStates<false, 0>(
+          ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
+          is_constant_hessian, share_state_->is_col_wise,
+          !share_state_->is_col_wise, config_->num_grad_quant_bins));
+    }
   }
   CHECK_NOTNULL(share_state_);
 }
@@ -169,6 +189,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
   }
   share_state_->num_threads = num_threads;
 
+  if (config_->use_quantized_grad) {
+    gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_);
+  }
+
   // some initial works before training
   BeforeTrain();
 
@@ -205,6 +229,11 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
     cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
   }
 
+  if (config_->use_quantized_grad && config_->quant_train_renew_leaf) {
+    gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_,
+      [this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); });
+  }
+
   Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth);
   return tree.release();
 }
@@ -270,11 +299,25 @@ void SerialTreeLearner::BeforeTrain() {
   // Sumup for root
   if (data_partition_->leaf_count(0) == num_data_) {
     // use all data
-    smaller_leaf_splits_->Init(gradients_, hessians_);
-
+    if (!config_->use_quantized_grad) {
+      smaller_leaf_splits_->Init(gradients_, hessians_);
+    } else {
+      smaller_leaf_splits_->Init(
+        gradient_discretizer_->discretized_gradients_and_hessians(),
+        gradient_discretizer_->grad_scale(),
+        gradient_discretizer_->hess_scale());
+    }
   } else {
     // use bagging, only use part of data
-    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
+    if (!config_->use_quantized_grad) {
+      smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
+    } else {
+      smaller_leaf_splits_->Init(
+        0, data_partition_.get(),
+        gradient_discretizer_->discretized_gradients_and_hessians(),
+        gradient_discretizer_->grad_scale(),
+        gradient_discretizer_->hess_scale());
+    }
   }
 
   larger_leaf_splits_->Init();
@@ -282,6 +325,10 @@ void SerialTreeLearner::BeforeTrain() {
   if (cegb_ != nullptr) {
     cegb_->BeforeTrain();
   }
+
+  if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
+    gradient_discretizer_->SetNumBitsInHistogramBin<false>(0, -1, data_partition_->leaf_count(0), 0);
+  }
 }
 
 bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
@@ -353,22 +400,67 @@ void SerialTreeLearner::ConstructHistograms(
   Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
                                   global_timer);
   // construct smaller leaf
-  hist_t* ptr_smaller_leaf_hist_data =
-      smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
-  train_data_->ConstructHistograms(
-      is_feature_used, smaller_leaf_splits_->data_indices(),
-      smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
-      ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
-      ptr_smaller_leaf_hist_data);
-  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
-    // construct larger leaf
-    hist_t* ptr_larger_leaf_hist_data =
-        larger_leaf_histogram_array_[0].RawData() - kHistOffset;
-    train_data_->ConstructHistograms(
-        is_feature_used, larger_leaf_splits_->data_indices(),
-        larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
+  if (config_->use_quantized_grad) {
+    const uint8_t smaller_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
+    hist_t* ptr_smaller_leaf_hist_data =
+        smaller_leaf_num_bits <= 16 ?
+        reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
+        reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
+    #define SMALLER_LEAF_ARGS \
+      is_feature_used, smaller_leaf_splits_->data_indices(), \
+      smaller_leaf_splits_->num_data_in_leaf(), \
+      reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
+      nullptr, \
+      reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
+      nullptr, \
+      share_state_.get(), \
+      reinterpret_cast<hist_t*>(ptr_smaller_leaf_hist_data)
+    if (smaller_leaf_num_bits <= 16) {
+      train_data_->ConstructHistograms<true, 16>(SMALLER_LEAF_ARGS);
+    } else {
+      train_data_->ConstructHistograms<true, 32>(SMALLER_LEAF_ARGS);
+    }
+    #undef SMALLER_LEAF_ARGS
+    if (larger_leaf_histogram_array_ && !use_subtract) {
+      const uint8_t larger_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
+      hist_t* ptr_larger_leaf_hist_data =
+        larger_leaf_num_bits <= 16 ?
+        reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
+        reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
+      #define LARGER_LEAF_ARGS \
+        is_feature_used, larger_leaf_splits_->data_indices(), \
+        larger_leaf_splits_->num_data_in_leaf(), \
+        reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
+        nullptr, \
+        reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
+        nullptr, \
+        share_state_.get(), \
+        reinterpret_cast<hist_t*>(ptr_larger_leaf_hist_data)
+      if (larger_leaf_num_bits <= 16) {
+        train_data_->ConstructHistograms<true, 16>(LARGER_LEAF_ARGS);
+      } else {
+        train_data_->ConstructHistograms<true, 32>(LARGER_LEAF_ARGS);
+      }
+      #undef LARGER_LEAF_ARGS
+    }
+  } else {
+    hist_t* ptr_smaller_leaf_hist_data =
+        smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
+    train_data_->ConstructHistograms<false, 0>(
+        is_feature_used, smaller_leaf_splits_->data_indices(),
+        smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
         ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
-        ptr_larger_leaf_hist_data);
+        ptr_smaller_leaf_hist_data);
+    if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
+      // construct larger leaf
+      hist_t* ptr_larger_leaf_hist_data =
+          larger_leaf_histogram_array_[0].RawData() - kHistOffset;
+      train_data_->ConstructHistograms<false, 0>(
+          is_feature_used, larger_leaf_splits_->data_indices(),
+          larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
+          ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
+          ptr_larger_leaf_hist_data);
+    }
   }
 }
 
@@ -388,6 +480,26 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
   if (larger_leaf_splits_->leaf_index() >= 0) {
     larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index());
   }
+
+  if (use_subtract && config_->use_quantized_grad) {
+    const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
+    const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
+    const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
+    if (parent_hist_bits > 16 && larger_hist_bits <= 16) {
+      OMP_INIT_EX();
+      #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
+      for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
+        OMP_LOOP_EX_BEGIN();
+        if (!is_feature_used[feature_index]) {
+          continue;
+        }
+        larger_leaf_histogram_array_[feature_index].CopyToBuffer(gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
+        OMP_LOOP_EX_END();
+      }
+      OMP_THROW_EX();
+    }
+  }
+
   OMP_INIT_EX();
 // find splits
 #pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
@@ -397,10 +509,24 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
       continue;
     }
     const int tid = omp_get_thread_num();
-    train_data_->FixHistogram(
-        feature_index, smaller_leaf_splits_->sum_gradients(),
-        smaller_leaf_splits_->sum_hessians(),
-        smaller_leaf_histogram_array_[feature_index].RawData());
+    if (config_->use_quantized_grad) {
+      const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
+      const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians();
+      if (hist_bits_bin <= 16) {
+        train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
+            feature_index, int_sum_gradient_and_hessian,
+            reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt16()));
+      } else {
+        train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
+            feature_index, int_sum_gradient_and_hessian,
+            reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt32()));
+      }
+    } else {
+      train_data_->FixHistogram(
+          feature_index, smaller_leaf_splits_->sum_gradients(),
+          smaller_leaf_splits_->sum_hessians(),
+          smaller_leaf_histogram_array_[feature_index].RawData());
+    }
     int real_fidx = train_data_->RealFeatureIndex(feature_index);
 
     ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index,
@@ -417,13 +543,50 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
     }
 
     if (use_subtract) {
-      larger_leaf_histogram_array_[feature_index].Subtract(
-          smaller_leaf_histogram_array_[feature_index]);
+      if (config_->use_quantized_grad) {
+        const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
+        const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
+        const uint8_t smaller_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
+        const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
+        if (parent_hist_bits <= 16) {
+          CHECK_LE(smaller_hist_bits, 16);
+          CHECK_LE(larger_hist_bits, 16);
+          larger_leaf_histogram_array_[feature_index].Subtract<true, int32_t, int32_t, int32_t, 16, 16, 16>(
+              smaller_leaf_histogram_array_[feature_index]);
+        } else if (larger_hist_bits <= 16) {
+          CHECK_LE(smaller_hist_bits, 16);
+          larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int32_t, 32, 16, 16>(
+              smaller_leaf_histogram_array_[feature_index], gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
+        } else if (smaller_hist_bits <= 16) {
+          larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int64_t, 32, 16, 32>(
+              smaller_leaf_histogram_array_[feature_index]);
+        } else {
+          larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int64_t, int64_t, 32, 32, 32>(
+              smaller_leaf_histogram_array_[feature_index]);
+        }
+      } else {
+        larger_leaf_histogram_array_[feature_index].Subtract<false>(
+            smaller_leaf_histogram_array_[feature_index]);
+      }
     } else {
-      train_data_->FixHistogram(
-          feature_index, larger_leaf_splits_->sum_gradients(),
-          larger_leaf_splits_->sum_hessians(),
-          larger_leaf_histogram_array_[feature_index].RawData());
+      if (config_->use_quantized_grad) {
+        const int64_t int_sum_gradient_and_hessian = larger_leaf_splits_->int_sum_gradients_and_hessians();
+        const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
+        if (hist_bits_bin <= 16) {
+          train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
+              feature_index, int_sum_gradient_and_hessian,
+              reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt16()));
+        } else {
+          train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
+              feature_index, int_sum_gradient_and_hessian,
+              reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt32()));
+        }
+      } else {
+        train_data_->FixHistogram(
+            feature_index, larger_leaf_splits_->sum_gradients(),
+            larger_leaf_splits_->sum_hessians(),
+            larger_leaf_histogram_array_[feature_index].RawData());
+      }
     }
 
     ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index,
@@ -699,6 +862,11 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
                               best_split_info.left_sum_hessian,
                               best_split_info.left_output);
   }
+  if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
+    gradient_discretizer_->SetNumBitsInHistogramBin<false>(*left_leaf, *right_leaf,
+                                                    data_partition_->leaf_count(*left_leaf),
+                                                    data_partition_->leaf_count(*right_leaf));
+  }
   auto leaves_need_update = constraints_->Update(
       is_numerical_split, *left_leaf, *right_leaf,
       best_split_info.monotone_type, best_split_info.right_output,
@@ -762,9 +930,21 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
         train_data_->FeatureNumBin(feature_index));
   }
   SplitInfo new_split;
-  histogram_array_[feature_index].FindBestThreshold(
-      leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
-      constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
+  if (config_->use_quantized_grad) {
+    const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(leaf_splits->leaf_index());
+    histogram_array_[feature_index].FindBestThresholdInt(
+        leaf_splits->int_sum_gradients_and_hessians(),
+        gradient_discretizer_->grad_scale(),
+        gradient_discretizer_->hess_scale(),
+        hist_bits_bin,
+        hist_bits_bin,
+        num_data,
+        constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
+  } else {
+    histogram_array_[feature_index].FindBestThreshold(
+        leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
+        constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
+  }
   new_split.feature = real_fidx;
   if (cegb_ != nullptr) {
     new_split.gain -=
diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h
index 14b78eb6a577..1f8e3add0d8c 100644
--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -24,6 +24,7 @@
 #include "col_sampler.hpp"
 #include "data_partition.hpp"
 #include "feature_histogram.hpp"
+#include "gradient_discretizer.hpp"
 #include "leaf_splits.hpp"
 #include "monotone_constraints.hpp"
 #include "split_info.hpp"
@@ -170,6 +171,8 @@ class SerialTreeLearner: public TreeLearner {
 
   std::set<int> FindAllForceFeatures(Json force_split_leaf_setting);
 
+  void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index);
+
   /*!
   * \brief Get the number of data in a leaf
   * \param leaf_idx The index of leaf
@@ -230,6 +233,7 @@ class SerialTreeLearner: public TreeLearner {
   const Json* forced_split_json_;
   std::unique_ptr<TrainingShareStates> share_state_;
   std::unique_ptr<CostEfficientGradientBoosting> cegb_;
+  std::unique_ptr<GradientDiscretizer> gradient_discretizer_;
 };
 
 inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
diff --git a/src/treelearner/split_info.hpp b/src/treelearner/split_info.hpp
index 644bd329b3a6..234105eb9a34 100644
--- a/src/treelearner/split_info.hpp
+++ b/src/treelearner/split_info.hpp
@@ -40,10 +40,14 @@ struct SplitInfo {
   double left_sum_gradient = 0;
   /*! \brief Left sum hessian after split */
   double left_sum_hessian = 0;
+  /*! \brief Left sum discretized gradient and hessian after split */
+  int64_t left_sum_gradient_and_hessian = 0;
   /*! \brief Right sum gradient after split */
   double right_sum_gradient = 0;
   /*! \brief Right sum hessian after split */
   double right_sum_hessian = 0;
+  /*! \brief Right sum discretized gradient and hessian after split */
+  int64_t right_sum_gradient_and_hessian = 0;
   std::vector<uint32_t> cat_threshold;
   /*! \brief True if default split is left */
   bool default_left = true;
@@ -71,10 +75,14 @@ struct SplitInfo {
     buffer += sizeof(left_sum_gradient);
     std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian));
     buffer += sizeof(left_sum_hessian);
+    std::memcpy(buffer, &left_sum_gradient_and_hessian, sizeof(left_sum_gradient_and_hessian));
+    buffer += sizeof(left_sum_gradient_and_hessian);
     std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient));
     buffer += sizeof(right_sum_gradient);
     std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian));
     buffer += sizeof(right_sum_hessian);
+    std::memcpy(buffer, &right_sum_gradient_and_hessian, sizeof(right_sum_gradient_and_hessian));
+    buffer += sizeof(right_sum_gradient_and_hessian);
     std::memcpy(buffer, &default_left, sizeof(default_left));
     buffer += sizeof(default_left);
     std::memcpy(buffer, &monotone_type, sizeof(monotone_type));
@@ -103,10 +111,14 @@ struct SplitInfo {
     buffer += sizeof(left_sum_gradient);
     std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian));
     buffer += sizeof(left_sum_hessian);
+    std::memcpy(&left_sum_gradient_and_hessian, buffer, sizeof(left_sum_gradient_and_hessian));
+    buffer += sizeof(left_sum_gradient_and_hessian);
     std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient));
     buffer += sizeof(right_sum_gradient);
     std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian));
     buffer += sizeof(right_sum_hessian);
+    std::memcpy(&right_sum_gradient_and_hessian, buffer, sizeof(right_sum_gradient_and_hessian));
+    buffer += sizeof(right_sum_gradient_and_hessian);
     std::memcpy(&default_left, buffer, sizeof(default_left));
     buffer += sizeof(default_left);
     std::memcpy(&monotone_type, buffer, sizeof(monotone_type));
diff --git a/swig/pointer_manipulation.i b/swig/pointer_manipulation.i
index 28635b34ac62..de0bddd42f8e 100644
--- a/swig/pointer_manipulation.i
+++ b/swig/pointer_manipulation.i
@@ -15,6 +15,7 @@
  * to arrays of size max(int64_t) instead of max(int32_t).
  */
 
+%pointer_functions(uint8_t, bytep)
 %pointer_functions(int, intp)
 %pointer_functions(long, longp)
 %pointer_functions(double, doublep)
@@ -33,6 +34,7 @@
 %pointer_cast(double *, void *, double_to_voidp_ptr)
 %pointer_cast(float *, void *, float_to_voidp_ptr)
 %pointer_cast(int *, void *, int_to_voidp_ptr)
+%pointer_cast(uint8_t *, void *, byte_to_voidp_ptr)
 %pointer_cast(int32_t *, void *, int32_t_to_voidp_ptr)
 %pointer_cast(int64_t *, void *, int64_t_to_voidp_ptr)
 
diff --git a/tests/distributed/_test_distributed.py b/tests/distributed/_test_distributed.py
index 9e1dd8e4f5a4..9ede4e0800fb 100644
--- a/tests/distributed/_test_distributed.py
+++ b/tests/distributed/_test_distributed.py
@@ -106,7 +106,7 @@ def _write_data(self, partitions: List[np.ndarray]) -> None:
         for i, partition in enumerate(partitions):
             np.savetxt(str(TESTS_DIR / f'train{i}.txt'), partition, delimiter=',')
 
-    def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None:
+    def fit(self, partitions: List[np.ndarray], train_config: Dict) -> None:
         """Run the distributed training process on a single machine.
 
         For each worker i:
@@ -134,7 +134,7 @@ def fit(self, partitions: List[np.ndarray], train_config: Dict = {}) -> None:
             if result.returncode != 0:
                 raise RuntimeError('Error in training')
 
-    def predict(self, predict_config: Dict[str, Any] = {}) -> np.ndarray:
+    def predict(self, predict_config: Dict[str, Any]) -> np.ndarray:
         """Compute the predictions using the model created in the fit step.
 
         predict_config is used to predict the training set train.txt
@@ -178,7 +178,7 @@ def test_classifier(executable):
     }
     clf = DistributedMockup(executable)
     clf.fit(partitions, train_params)
-    y_probas = clf.predict()
+    y_probas = clf.predict(predict_config={})
     y_pred = y_probas > 0.5
     assert accuracy_score(clf.label_, y_pred) == 1.
 
@@ -194,5 +194,5 @@ def test_regressor(executable):
     }
     reg = DistributedMockup(executable)
     reg.fit(partitions, train_params)
-    y_pred = reg.predict()
+    y_pred = reg.predict(predict_config={})
     np.testing.assert_allclose(y_pred, reg.label_, rtol=0.2, atol=50.)
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index 7bb2d99a4037..5e237724ae85 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -2,6 +2,7 @@
 import filecmp
 import numbers
 import re
+from copy import deepcopy
 from os import getenv
 from pathlib import Path
 
@@ -324,7 +325,7 @@ def test_add_features_same_booster_behaviour(tmp_path):
         d.set_label(y)
         b1 = lgb.Booster(train_set=d1)
         b = lgb.Booster(train_set=d)
-        for k in range(10):
+        for _ in range(10):
             b.update()
             b1.update()
         dname = tmp_path / "d.txt"
@@ -365,7 +366,7 @@ def test_add_features_from_different_sources():
 
         # test that method works for different data types
         d1 = lgb.Dataset(x_1, feature_name=names, free_raw_data=False).construct()
-        res_feature_names = [name for name in names]
+        res_feature_names = deepcopy(names)
         for idx, x_2 in enumerate(xxs, 2):
             original_type = type(d1.get_data())
             d2 = lgb.Dataset(x_2, feature_name=names, free_raw_data=False).construct()
@@ -407,7 +408,7 @@ def test_cegb_affects_behavior(tmp_path):
     ds = lgb.Dataset(X, feature_name=names).construct()
     ds.set_label(y)
     base = lgb.Booster(train_set=ds)
-    for k in range(10):
+    for _ in range(10):
         base.update()
     basename = tmp_path / "basename.txt"
     base.save_model(basename)
@@ -419,7 +420,7 @@ def test_cegb_affects_behavior(tmp_path):
              {'cegb_penalty_split': 1}]
     for case in cases:
         booster = lgb.Booster(train_set=ds, params=case)
-        for k in range(10):
+        for _ in range(10):
             booster.update()
         casename = tmp_path / "casename.txt"
         booster.save_model(casename)
@@ -445,7 +446,7 @@ def test_cegb_scaling_equalities(tmp_path):
     for (p1, p2) in pairs:
         booster1 = lgb.Booster(train_set=ds, params=p1)
         booster2 = lgb.Booster(train_set=ds, params=p2)
-        for k in range(10):
+        for _ in range(10):
             booster1.update()
             booster2.update()
         p1name = tmp_path / "p1.txt"
@@ -632,17 +633,17 @@ def test_list_to_1d_numpy(collection, dtype):
             y = pd_Series(y)
     if isinstance(y, np.ndarray) and len(y.shape) == 2:
         with pytest.warns(UserWarning, match='column-vector'):
-            lgb.basic._list_to_1d_numpy(y)
+            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
         return
     elif isinstance(y, list) and isinstance(y[0], list):
         with pytest.raises(TypeError):
-            lgb.basic._list_to_1d_numpy(y)
+            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
         return
     elif isinstance(y, pd_Series) and y.dtype == object:
         with pytest.raises(ValueError):
-            lgb.basic._list_to_1d_numpy(y)
+            lgb.basic._list_to_1d_numpy(y, dtype=np.float32, name="list")
         return
-    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype)
+    result = lgb.basic._list_to_1d_numpy(y, dtype=dtype, name="list")
     assert result.size == 10
     assert result.dtype == dtype
 
@@ -752,10 +753,10 @@ def test_feature_num_bin(min_data_in_bin):
     ]).T
     n_continuous = X.shape[1] - 1
     feature_name = [f'x{i}' for i in range(n_continuous)] + ['cat1']
-    ds_kwargs = dict(
-        params={'min_data_in_bin': min_data_in_bin},
-        categorical_feature=[n_continuous],  # last feature
-    )
+    ds_kwargs = {
+        "params": {'min_data_in_bin': min_data_in_bin},
+        "categorical_feature": [n_continuous],  # last feature
+    }
     ds = lgb.Dataset(X, feature_name=feature_name, **ds_kwargs).construct()
     expected_num_bins = [
         100 // min_data_in_bin + 1,  # extra bin for zero
diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py
index 594f88f527ac..662020428270 100644
--- a/tests/python_package_test/test_dask.py
+++ b/tests/python_package_test/test_dask.py
@@ -1062,9 +1062,9 @@ def test_eval_set_no_early_stopping(task, output, eval_sizes, eval_names_prefix,
                 eval_class_weight.append({0: n_neg / n_pos, 1: n_pos / n_neg})
                 init_score_value = np.log(np.mean(y_e) / (1 - np.mean(y_e)))
                 if 'dataframe' in output:
-                    d_init_score = dy_e.map_partitions(lambda x: pd.Series([init_score_value] * x.size))
+                    d_init_score = dy_e.map_partitions(lambda x, val=init_score_value: pd.Series([val] * x.size))
                 else:
-                    d_init_score = dy_e.map_blocks(lambda x: np.repeat(init_score_value, x.size))
+                    d_init_score = dy_e.map_blocks(lambda x, val=init_score_value: np.repeat(val, x.size))
 
                 eval_init_score.append(d_init_score)
 
@@ -1854,3 +1854,44 @@ def test_predict_with_raw_score(task, output, cluster):
         if task.endswith('classification'):
             pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
             assert_eq(raw_predictions, pred_proba_raw)
+
+
+def test_distributed_quantized_training(cluster):
+    """Check that distributed training with quantized gradients is not much worse than without."""
+    with Client(cluster) as client:
+        _, y, _, _, dX, dy, dw, _ = _create_data(
+            objective='regression',
+            output='array'
+        )
+
+        # shared settings: the two fits below differ only in 'use_quantized_grad'
+        params = {
+            "boosting_type": 'gbdt',
+            "n_estimators": 50,
+            "num_leaves": 31,
+            'use_quantized_grad': True,
+            'num_grad_quant_bins': 30,
+            'quant_train_renew_leaf': True,
+            'verbose': -1,
+            'force_row_wise': True,
+        }
+
+        quant_dask_regressor = lgb.DaskLGBMRegressor(
+            client=client,
+            time_out=5,
+            **params
+        )
+        quant_dask_regressor = quant_dask_regressor.fit(dX, dy, sample_weight=dw)
+        quant_preds = quant_dask_regressor.predict(dX)
+        quant_rmse = np.sqrt(np.mean((quant_preds.compute() - y) ** 2))
+
+        params["use_quantized_grad"] = False
+        dask_regressor = lgb.DaskLGBMRegressor(
+            client=client,
+            time_out=5,
+            **params
+        )
+        dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
+        preds = dask_regressor.predict(dX)
+        rmse = np.sqrt(np.mean((preds.compute() - y) ** 2))
+        assert quant_rmse < rmse + 7.0
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index b9709e6bcea6..e87cea3bfcbb 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -886,13 +886,13 @@ def test_early_stopping_min_delta(first_only, single_metric, greater_is_better):
         min_delta = metric2min_delta[metric[0]]
     else:
         min_delta = [metric2min_delta[m] for m in metric]
-    train_kwargs = dict(
-        params=params,
-        train_set=train_ds,
-        num_boost_round=50,
-        valid_sets=[train_ds, valid_ds],
-        valid_names=['training', 'valid'],
-    )
+    train_kwargs = {
+        "params": params,
+        "train_set": train_ds,
+        "num_boost_round": 50,
+        "valid_sets": [train_ds, valid_ds],
+        "valid_names": ['training', 'valid'],
+    }
 
     # regular early stopping
     evals_result = {}
@@ -1075,6 +1075,67 @@ def test_cv():
     np.testing.assert_allclose(cv_res_lambda['valid ndcg@3-mean'], cv_res_lambda_obj['valid ndcg@3-mean'])
 
 
+def test_cv_works_with_init_model(tmp_path):
+    """Check that ``lgb.cv()`` continues training from ``init_model``.
+
+    ``init_model`` is given once as an in-memory Booster and once as the path of a saved model file.
+    """
+    X, y = make_synthetic_regression()
+    params = {'objective': 'regression', 'verbose': -1}
+    num_train_rounds = 2
+    num_cv_rounds = 5
+    nfold = 3
+    lgb_train = lgb.Dataset(X, y, free_raw_data=False)
+
+    # model that both cv() runs start from
+    bst = lgb.train(
+        params=params,
+        train_set=lgb_train,
+        num_boost_round=num_train_rounds
+    )
+    preds_raw = bst.predict(X, raw_score=True)
+    model_path_txt = str(tmp_path / 'lgb.model')
+    bst.save_model(model_path_txt)
+
+    def _run_cv(init_model):
+        """Run cv() starting from ``init_model``, check each fold's booster, return the CVBooster."""
+        cv_res = lgb.cv(
+            train_set=lgb_train,
+            init_model=init_model,
+            num_boost_round=num_cv_rounds,
+            nfold=nfold,
+            stratified=False,
+            shuffle=False,
+            seed=708,
+            return_cvbooster=True,
+            params=params
+        )
+        cv_booster = cv_res["cvbooster"]
+        # every fold's booster holds the initial rounds plus the cv rounds
+        assert cv_booster.current_iteration() == [num_train_rounds + num_cv_rounds] * nfold
+        for fold_booster in cv_booster.boosters:
+            # restricted to the first `num_train_rounds` iterations, it must reproduce the initial model
+            np.testing.assert_allclose(
+                preds_raw,
+                fold_booster.predict(X, raw_score=True, num_iteration=num_train_rounds)
+            )
+
+        return cv_booster
+
+    # init_model from an in-memory Booster
+    cv_bst_w_in_mem_init_model = _run_cv(bst)
+
+    # init_model from a text file
+    cv_bst_w_file_init_model = _run_cv(model_path_txt)
+
+    # predictions should be identical
+    for fold_idx in range(nfold):
+        np.testing.assert_allclose(
+            cv_bst_w_in_mem_init_model.boosters[fold_idx].predict(X),
+            cv_bst_w_file_init_model.boosters[fold_idx].predict(X)
+        )
+
+
 def test_cvbooster():
     X, y = load_breast_cancer(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -1710,7 +1771,7 @@ def parse_tree_features(gbm):
             for tree in tree_str:
                 # split_features are in 4th line.
                 features = tree.splitlines()[3].split("=")[1].split(" ")
-                features = set(f"Column_{f}" for f in features)
+                features = {f"Column_{f}" for f in features}
                 feature_sets.append(features)
             return np.array(feature_sets)
 
@@ -2799,14 +2860,14 @@ def metrics_combination_cv_regression(metric_list, assumed_iteration,
     iter_valid1_l2 = 3
     iter_valid2_l1 = 3
     iter_valid2_l2 = 15
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
 
     iter_cv_l1 = 15
     iter_cv_l2 = 13
-    assert len(set([iter_cv_l1, iter_cv_l2])) == 2
+    assert len({iter_cv_l1, iter_cv_l2}) == 2
     iter_cv_min = min([iter_cv_l1, iter_cv_l2])
 
     # test for lgb.train
@@ -4015,3 +4076,59 @@ def test_validate_features():
 
     # check that disabling the check doesn't raise the error
     bst.refit(df2, y, validate_features=False)
+
+
+def test_train_and_cv_raise_informative_error_for_train_set_of_wrong_type():
+    """train() and cv() must reject a list passed as ``train_set`` with an informative TypeError."""
+    for name, func in (("train", lgb.train), ("cv", lgb.cv)):
+        with pytest.raises(TypeError, match=rf"{name}\(\) only accepts Dataset object, train_set has type 'list'\."):
+            func({}, train_set=[])
+
+
+@pytest.mark.parametrize('num_boost_round', [-7, -1, 0])
+def test_train_and_cv_raise_informative_error_for_impossible_num_boost_round(num_boost_round):
+    """Non-positive ``num_boost_round`` must be rejected by both train() and cv() with a ValueError."""
+    X, y = make_synthetic_regression(n_samples=100)
+    expected_pattern = rf"num_boost_round must be greater than 0\. Got {num_boost_round}\."
+    for entry_point in (lgb.train, lgb.cv):
+        with pytest.raises(ValueError, match=expected_pattern):
+            entry_point({}, train_set=lgb.Dataset(X, y), num_boost_round=num_boost_round)
+
+
+def test_train_raises_informative_error_if_any_valid_sets_are_not_dataset_objects():
+    """train() must raise a TypeError that points at item 1 of ``valid_sets`` (a tuple)."""
+    X, y = make_synthetic_regression(n_samples=100)
+    expected_pattern = r"Every item in valid_sets must be a Dataset object\. Item 1 has type 'tuple'\."
+    # item 0 is a Dataset; item 1 is a tuple and item 2 is a list
+    mixed_valid_sets = [
+        lgb.Dataset(X * 2.0, y),
+        ([1.0], [2.0]),
+        [5.6, 5.7, 5.8]
+    ]
+    train_set = lgb.Dataset(X, y)
+    with pytest.raises(TypeError, match=expected_pattern):
+        lgb.train(params={}, train_set=train_set, valid_sets=mixed_valid_sets)
+
+
+def test_train_raises_informative_error_for_params_of_wrong_type():
+    """A string given for ``early_stopping_round`` must make train() raise LightGBMError."""
+    X, y = make_synthetic_regression()
+    expected_pattern = "Parameter early_stopping_round should be of type int, got \"too-many\""
+    with pytest.raises(lgb.basic.LightGBMError, match=expected_pattern):
+        lgb.train({"early_stopping_round": "too-many"}, lgb.Dataset(X, label=y))
+
+
+def test_quantized_training():
+    """Training with quantized gradients should reach an RMSE within 6.0 of regular training."""
+    features, target = make_synthetic_regression()
+    train_data = lgb.Dataset(features, label=target)
+    train_params = {'num_leaves': 15, 'verbose': -1, 'seed': 0}
+    # reference run without gradient quantization
+    reference = lgb.train(train_params, train_data, num_boost_round=10)
+    reference_rmse = np.sqrt(np.mean((reference.predict(features) - target) ** 2))
+    # same parameters and data, with gradient quantization switched on
+    train_params.update({'use_quantized_grad': True, 'num_grad_quant_bins': 30, 'quant_train_renew_leaf': True})
+    quantized = lgb.train(train_params, train_data, num_boost_round=10)
+    quantized_rmse = np.sqrt(np.mean((quantized.predict(features) - target) ** 2))
+    # the quantized model may be worse than the reference, but by less than 6.0 RMSE
+    assert quantized_rmse < reference_rmse + 6.0
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 746c958a7304..2f1372545067 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -9,17 +9,19 @@
 import joblib
 import numpy as np
 import pytest
+import scipy.sparse
+from scipy.stats import spearmanr
 from sklearn.base import clone
 from sklearn.datasets import load_svmlight_file, make_blobs, make_multilabel_classification
 from sklearn.ensemble import StackingClassifier, StackingRegressor
-from sklearn.metrics import log_loss, mean_squared_error
+from sklearn.metrics import accuracy_score, log_loss, mean_squared_error, r2_score
 from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
 from sklearn.multioutput import ClassifierChain, MultiOutputClassifier, MultiOutputRegressor, RegressorChain
 from sklearn.utils.estimator_checks import parametrize_with_checks
 from sklearn.utils.validation import check_is_fitted
 
 import lightgbm as lgb
-from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame
+from lightgbm.compat import DATATABLE_INSTALLED, PANDAS_INSTALLED, dt_DataTable, pd_DataFrame, pd_Series
 
 from .utils import (load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking, make_synthetic_regression,
                     sklearn_multiclass_custom_objective, softmax)
@@ -27,20 +29,27 @@
 decreasing_generator = itertools.count(0, -1)
 task_to_model_factory = {
     'ranking': lgb.LGBMRanker,
-    'classification': lgb.LGBMClassifier,
+    'binary-classification': lgb.LGBMClassifier,
+    'multiclass-classification': lgb.LGBMClassifier,
     'regression': lgb.LGBMRegressor,
 }
 
 
-def _create_data(task):
+def _create_data(task, n_samples=100, n_features=4):
+    """Create ``(X, y, g)`` for ``task``; ``g`` is ``None`` for every task except 'ranking'."""
     if task == 'ranking':
-        X, y, g = make_ranking(n_features=4)
+        X, y, g = make_ranking(n_features=4, n_samples=n_samples)
         g = np.bincount(g)
-    elif task == 'classification':
-        X, y = load_iris(return_X_y=True)
+    elif task.endswith('classification'):
+        # value of `centers` handed to make_blobs() for each supported classification task
+        centers_by_task = {'binary-classification': 2, 'multiclass-classification': 3}
+        if task not in centers_by_task:
+            raise ValueError(f"Unknown classification task '{task}'")
+        centers = centers_by_task[task]
+        X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=42)
         g = None
     elif task == 'regression':
-        X, y = make_synthetic_regression()
+        X, y = make_synthetic_regression(n_samples=n_samples, n_features=n_features)
         g = None
     return X, y, g
 
@@ -248,6 +257,212 @@ def test_binary_classification_with_custom_objective():
     assert ret < 0.05
 
 
+def test_early_stopping_validation_set_split_strategy_param_check():
+    """fit() must raise ValueError for an unrecognized ``validation_set_split_strategy`` string."""
+    X, y = load_breast_cancer(return_X_y=True)
+    estimator_kwargs = {
+        "n_estimators": 50,
+        "random_state": 42,
+        "verbose": -1,
+        "early_stopping": True,
+        "validation_set_split_strategy": "invalid_strategy"
+    }
+    expected_pattern = r"validation_set_split_strategy must be a callable or one of the following*"
+    gbm = lgb.LGBMClassifier(**estimator_kwargs)
+    # the constructor sits outside the raises-block: the error is expected from fit() only
+    with pytest.raises(ValueError, match=expected_pattern):
+        gbm.fit(X, y)
+
+
+@pytest.mark.parametrize('use_weight', [True, False])
+def test_binary_classification_with_auto_early_stopping(use_weight):
+    """With ``early_stopping=True`` and no eval set, training should stop before ``n_estimators`` trees."""
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    sample_weight = None
+    if use_weight:
+        # one constant weight (2) per training label
+        sample_weight = np.full_like(y_train, 2)
+    gbm = lgb.LGBMClassifier(n_estimators=max_trees, random_state=42, verbose=-1, early_stopping=True)
+    gbm.fit(X_train, y_train, sample_weight=sample_weight)
+    test_log_loss = log_loss(y_test, gbm.predict_proba(X_test))
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_log_loss < 0.21
+
+
+@pytest.mark.parametrize('use_weight', [True, False])
+def test_binary_classification_with_auto_early_stopping_use_train_as_val_set(use_weight, recwarn):
+    """Early stopping with ``validation_fraction=None`` must still stop early and record no warnings."""
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    estimator_kwargs = {
+        "n_estimators": max_trees,
+        "random_state": 42,
+        "verbose": -1,
+        "early_stopping": True,
+        "validation_fraction": None,  # Use train as validation set
+    }
+    gbm = lgb.LGBMClassifier(**estimator_kwargs)
+    sample_weight = np.full_like(y_train, 2) if use_weight else None
+    gbm.fit(X_train, y_train, sample_weight=sample_weight)
+    # Nothing may have been recorded by `recwarn` up to here, in particular not
+    # UserWarning("Only training set found, disabling early stopping.")
+    assert len(recwarn) == 0
+    test_log_loss = log_loss(y_test, gbm.predict_proba(X_test))
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_log_loss < 0.24
+
+
+@pytest.mark.parametrize('use_weight', [True, False])
+def test_binary_classification_with_auto_early_stopping_random(use_weight):
+    """Early stopping with the "random" validation split strategy should stop before ``n_estimators``."""
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    sample_weight = np.full_like(y_train, 2) if use_weight else None
+    gbm = lgb.LGBMClassifier(
+        n_estimators=max_trees,
+        random_state=42,
+        verbose=-1,
+        early_stopping=True,
+        validation_set_split_strategy="random"
+    )
+    gbm.fit(X_train, y_train, sample_weight=sample_weight)
+    test_log_loss = log_loss(y_test, gbm.predict_proba(X_test))
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_log_loss < 0.18
+
+
+def test_binary_classification_with_custom_eval_set_splitter():
+    """A callable ``validation_set_split_strategy`` should work together with early stopping."""
+    def custom_val_splitter(X, y):
+        # stratified 90/10 split with a fixed seed
+        return train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    gbm = lgb.LGBMClassifier(
+        n_estimators=max_trees, random_state=42, early_stopping=True, verbose=-1,
+        validation_set_split_strategy=custom_val_splitter
+    )
+    # an early_stopping callback is supplied explicitly, in addition to early_stopping=True
+    gbm.fit(X_train, y_train, callbacks=[lgb.early_stopping(5)])
+    test_log_loss = log_loss(y_test, gbm.predict_proba(X_test))
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_log_loss < 0.18
+
+
+@pytest.mark.parametrize('use_weight', [True, False])
+def test_regression_with_auto_early_stopping(use_weight):
+    """LGBMRegressor with ``early_stopping=True`` and no eval set should stop before ``n_estimators``."""
+    X, y = make_synthetic_regression()
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    gbm = lgb.LGBMRegressor(n_estimators=max_trees, random_state=42, early_stopping=True, verbose=-1)
+    # when weighting is requested, every training row gets the same weight (2)
+    sample_weight = None
+    if use_weight:
+        sample_weight = np.full_like(y_train, 2)
+    gbm.fit(X_train, y_train, sample_weight=sample_weight)
+    test_mse = mean_squared_error(y_test, gbm.predict(X_test))
+    # fewer trees than requested means training stopped early
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_mse < 400
+
+
+def test_regression_with_custom_eval_set_splitter():
+    """A callable ``validation_set_split_strategy`` should work with LGBMRegressor early stopping."""
+    def custom_val_splitter(X, y):
+        # 90/10 split with a fixed seed
+        return train_test_split(X, y, test_size=0.1, random_state=42)
+    X, y = make_synthetic_regression()
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    max_trees = 1000
+    gbm = lgb.LGBMRegressor(
+        n_estimators=max_trees, random_state=42, early_stopping=True, verbose=-1,
+        validation_set_split_strategy=custom_val_splitter
+    )
+    # an early_stopping callback is supplied explicitly, in addition to early_stopping=True
+    gbm.fit(X_train, y_train, callbacks=[lgb.early_stopping(5)])
+    test_mse = mean_squared_error(y_test, gbm.predict(X_test))
+    assert gbm._Booster.num_trees() < max_trees
+    assert test_mse < 400
+
+
+@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
+@pytest.mark.parametrize('use_weight', [True, False])
+def test_lambdarank_with_auto_early_stopping(use_weight):
+    """LGBMRanker with ``early_stopping=True`` and no eval set should report a best iteration and scores."""
+    rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
+    X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
+    q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    sample_weight = np.full_like(y_train, 2) if use_weight else None
+    # learning rate: 0.1 minus 0.01 times the callback's argument, never below 0.01
+    lr_schedule = lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
+    gbm = lgb.LGBMRanker(n_estimators=50, random_state=42, early_stopping=True)
+    gbm.fit(
+        X_train,
+        y_train,
+        sample_weight=sample_weight,
+        group=q_train,
+        eval_at=[1, 3],
+        callbacks=[lr_schedule]
+    )
+    assert gbm.best_iteration_ <= 24
+    valid_scores = gbm.best_score_['valid_0']
+    assert valid_scores['ndcg@1'] > 0.5674
+    assert valid_scores['ndcg@3'] > 0.578
+
+
+@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
+@pytest.mark.parametrize('validation_set_split_strategy', ["random", "stratify"])
+def test_lambdarank_with_auto_early_stopping_raise_exception(validation_set_split_strategy):
+    """fit() with ``group`` must raise ValueError for the "random" and "stratify" split strategies."""
+    rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
+    X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
+    q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    ranker_kwargs = {
+        "n_estimators": 50,
+        "random_state": 42,
+        "early_stopping": True,
+        "validation_set_split_strategy": validation_set_split_strategy
+    }
+    gbm = lgb.LGBMRanker(**ranker_kwargs)
+    expected_pattern = r"Parameter group has been specified but the selected*"
+    # the constructor sits outside the raises-block: the error is expected from fit() only
+    with pytest.raises(ValueError, match=expected_pattern):
+        gbm.fit(X_train, y_train, group=q_train)
+
+
+@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version')
+def test_lambdarank_with_custom_eval_set_splitter():
+    """A callable ``validation_set_split_strategy`` should work with LGBMRanker early stopping."""
+    rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank'
+
+    def custom_val_splitter(X, y, weight, group):
+        # the inputs are passed through unchanged; the additional data is read from the rank.test files
+        X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test'))
+        q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query'))
+        return X, X_test, y, y_test, weight, None, group, q_test
+
+    X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train'))
+    q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query'))
+    gbm = lgb.LGBMRanker(
+        n_estimators=50, random_state=42, early_stopping=True, validation_set_split_strategy=custom_val_splitter
+    )
+    callbacks = [
+        lgb.early_stopping(10),
+        lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))
+    ]
+    gbm.fit(X_train, y_train, group=q_train, eval_at=[1, 3], callbacks=callbacks)
+    assert gbm.best_iteration_ <= 24
+    valid_scores = gbm.best_score_['valid_0']
+    assert valid_scores['ndcg@1'] > 0.5674
+    assert valid_scores['ndcg@3'] > 0.578
+
+
 def test_dart():
     X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@@ -304,20 +519,24 @@ def test_grid_search():
     y = y.astype(str)  # utilize label encoder at it's max power
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    grid_params = dict(boosting_type=['rf', 'gbdt'],
-                       n_estimators=[4, 6],
-                       reg_alpha=[0.01, 0.005])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    grid_params = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [4, 6],
+        "reg_alpha": [0.01, 0.005]
+    }
     evals_result = {}
-    fit_params = dict(
-        eval_set=[(X_val, y_val)],
-        eval_metric=constant_metric,
-        callbacks=[
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [
             lgb.early_stopping(2),
             lgb.record_evaluation(evals_result)
         ]
-    )
+    }
     grid = GridSearchCV(estimator=lgb.LGBMClassifier(**params), param_grid=grid_params, cv=2)
     grid.fit(X_train, y_train, **fit_params)
     score = grid.score(X_test, y_test)  # utilizes GridSearchCV default refit=True
@@ -341,14 +560,20 @@ def test_random_search():
     X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
                                                       random_state=42)
     n_iter = 3  # Number of samples
-    params = dict(subsample=0.8,
-                  subsample_freq=1)
-    param_dist = dict(boosting_type=['rf', 'gbdt'],
-                      n_estimators=[np.random.randint(low=3, high=10) for i in range(n_iter)],
-                      reg_alpha=[np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)])
-    fit_params = dict(eval_set=[(X_val, y_val)],
-                      eval_metric=constant_metric,
-                      callbacks=[lgb.early_stopping(2)])
+    params = {
+        "subsample": 0.8,
+        "subsample_freq": 1
+    }
+    param_dist = {
+        "boosting_type": ['rf', 'gbdt'],
+        "n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
+        "reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)]
+    }
+    fit_params = {
+        "eval_set": [(X_val, y_val)],
+        "eval_metric": constant_metric,
+        "callbacks": [lgb.early_stopping(2)]
+    }
     rand = RandomizedSearchCV(estimator=lgb.LGBMClassifier(**params),
                               param_distributions=param_dist, cv=2,
                               n_iter=n_iter, random_state=42)
@@ -1130,7 +1355,7 @@ def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_
     iter_valid1_l2 = 4
     iter_valid2_l1 = 2
     iter_valid2_l2 = 2
-    assert len(set([iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2])) == 2
+    assert len({iter_valid1_l1, iter_valid1_l2, iter_valid2_l1, iter_valid2_l2}) == 2
     iter_min_l1 = min([iter_valid1_l1, iter_valid2_l1])
     iter_min_l2 = min([iter_valid1_l2, iter_valid2_l2])
     iter_min = min([iter_min_l1, iter_min_l2])
@@ -1268,7 +1493,7 @@ def test_sklearn_integration(estimator, check):
     check(estimator)
 
 
-@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression'])
+@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
 def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
     pd = pytest.importorskip("pandas")
     X, y, g = _create_data(task)
@@ -1378,9 +1603,9 @@ def test_default_n_jobs(tmp_path):
 
 
 @pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed')
-@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression'])
+@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'ranking', 'regression'])
 def test_validate_features(task):
-    X, y, g = _create_data(task)
+    X, y, g = _create_data(task, n_features=4)
     features = ['x1', 'x2', 'x3', 'x4']
     df = pd_DataFrame(X, columns=features)
     model = task_to_model_factory[task](n_estimators=10, num_leaves=15, verbose=-1)
@@ -1397,3 +1622,148 @@ def test_validate_features(task):
 
     # check that disabling the check doesn't raise the error
     model.predict(df2, validate_features=False)
+
+
+@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
+@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_Series', 'pd_DataFrame'])
+@pytest.mark.parametrize('task', ['binary-classification', 'multiclass-classification', 'regression'])
+def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
+    if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
+        pytest.skip('pandas is not installed')
+    if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
+        pytest.skip('datatable is not installed')  # in practice only X_type has a 'dt_' value
+    X, y, g = _create_data(task, n_samples=2_000)  # g is not used by this test
+    weights = np.abs(np.random.randn(y.shape[0]))  # non-negative, one weight per entry of y
+
+    if task == 'binary-classification' or task == 'regression':
+        init_score = np.full_like(y, np.mean(y))  # constant score; full_like keeps y's shape and dtype
+    elif task == 'multiclass-classification':
+        init_score = np.outer(y, np.array([0.1, 0.2, 0.7]))  # 2-D: one row per label, 3 columns
+    else:
+        raise ValueError(f"Unrecognized task '{task}'")
+
+    X_valid = X * 2  # computed before X is converted to X_type below
+    if X_type == 'dt_DataTable':
+        X = dt_DataTable(X)
+    elif X_type == 'list2d':
+        X = X.tolist()
+    elif X_type == 'scipy_csc':
+        X = scipy.sparse.csc_matrix(X)
+    elif X_type == 'scipy_csr':
+        X = scipy.sparse.csr_matrix(X)
+    elif X_type == 'pd_DataFrame':
+        X = pd_DataFrame(X)
+    elif X_type != 'numpy':  # 'numpy' leaves X exactly as _create_data returned it
+        raise ValueError(f"Unrecognized X_type: '{X_type}'")
+
+    # convert weights and init_score together with y (list with list, pandas with pandas)
+    # rather than parametrizing them separately, which would multiply the number of test cases
+    if y_type == 'list1d':
+        y = y.tolist()
+        weights = weights.tolist()
+        init_score = init_score.tolist()
+    elif y_type == 'pd_DataFrame':
+        y = pd_DataFrame(y)
+        weights = pd_Series(weights)  # weights stay a Series even though y is a DataFrame
+        if task == 'multiclass-classification':
+            init_score = pd_DataFrame(init_score)  # multiclass init_score is 2-D (np.outer above)
+        else:
+            init_score = pd_Series(init_score)
+    elif y_type == 'pd_Series':
+        y = pd_Series(y)
+        weights = pd_Series(weights)
+        if task == 'multiclass-classification':
+            init_score = pd_DataFrame(init_score)  # multiclass init_score is 2-D (np.outer above)
+        else:
+            init_score = pd_Series(init_score)
+    elif y_type != 'numpy':  # 'numpy' keeps y, weights and init_score unchanged
+        raise ValueError(f"Unrecognized y_type: '{y_type}'")
+
+    model = task_to_model_factory[task](n_estimators=10, verbose=-1)
+    model.fit(
+        X=X,
+        y=y,
+        sample_weight=weights,
+        init_score=init_score,
+        eval_set=[(X_valid, y)],  # validation set reuses the training labels, weights and init_score
+        eval_sample_weight=[weights],
+        eval_init_score=[init_score]
+    )
+
+    preds = model.predict(X)  # scored on the same X the model was fit on
+    if task == 'binary-classification':
+        assert accuracy_score(y, preds) >= 0.99
+    elif task == 'multiclass-classification':
+        assert accuracy_score(y, preds) >= 0.99
+    elif task == 'regression':
+        assert r2_score(y, preds) > 0.86
+    else:  # defensive: an unknown task already raises when init_score is built
+        raise ValueError(f"Unrecognized task: '{task}'")
+
+
+@pytest.mark.parametrize('X_type', ['dt_DataTable', 'list2d', 'numpy', 'scipy_csc', 'scipy_csr', 'pd_DataFrame'])
+@pytest.mark.parametrize('y_type', ['list1d', 'numpy', 'pd_DataFrame', 'pd_Series'])
+@pytest.mark.parametrize('g_type', ['list1d_float', 'list1d_int', 'numpy', 'pd_Series'])
+def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
+    if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
+        pytest.skip('pandas is not installed')
+    if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
+        pytest.skip('datatable is not installed')  # in practice only X_type has a 'dt_' value
+    X, y, g = _create_data(task='ranking', n_samples=1_000)
+    weights = np.abs(np.random.randn(y.shape[0]))  # non-negative, one weight per entry of y
+    init_score = np.full_like(y, np.mean(y))  # constant score; full_like keeps y's shape and dtype
+    X_valid = X * 2  # computed before X is converted to X_type below
+
+    if X_type == 'dt_DataTable':
+        X = dt_DataTable(X)
+    elif X_type == 'list2d':
+        X = X.tolist()
+    elif X_type == 'scipy_csc':
+        X = scipy.sparse.csc_matrix(X)
+    elif X_type == 'scipy_csr':
+        X = scipy.sparse.csr_matrix(X)
+    elif X_type == 'pd_DataFrame':
+        X = pd_DataFrame(X)
+    elif X_type != 'numpy':  # 'numpy' leaves X exactly as _create_data returned it
+        raise ValueError(f"Unrecognized X_type: '{X_type}'")
+
+    # convert weights and init_score together with y (list with list, pandas with pandas)
+    # rather than parametrizing them separately, which would multiply the number of test cases
+    if y_type == 'list1d':
+        y = y.tolist()
+        weights = weights.tolist()
+        init_score = init_score.tolist()
+    elif y_type == 'pd_DataFrame':
+        y = pd_DataFrame(y)
+        weights = pd_Series(weights)  # weights and init_score stay Series even though y is a DataFrame
+        init_score = pd_Series(init_score)
+    elif y_type == 'pd_Series':
+        y = pd_Series(y)
+        weights = pd_Series(weights)
+        init_score = pd_Series(init_score)
+    elif y_type != 'numpy':  # 'numpy' keeps y, weights and init_score unchanged
+        raise ValueError(f"Unrecognized y_type: '{y_type}'")
+
+    if g_type == 'list1d_float':
+        g = g.astype("float").tolist()  # plain list of Python floats
+    elif g_type == 'list1d_int':
+        g = g.astype("int").tolist()  # plain list of Python ints
+    elif g_type == 'pd_Series':
+        g = pd_Series(g)
+    elif g_type != 'numpy':  # 'numpy' leaves g exactly as _create_data returned it
+        raise ValueError(f"Unrecognized g_type: '{g_type}'")
+
+    model = task_to_model_factory['ranking'](n_estimators=10, verbose=-1)
+    model.fit(
+        X=X,
+        y=y,
+        sample_weight=weights,
+        init_score=init_score,
+        group=g,
+        eval_set=[(X_valid, y)],  # validation set reuses the training labels, weights, init_score and group
+        eval_sample_weight=[weights],
+        eval_init_score=[init_score],
+        eval_group=[g]
+    )
+    preds = model.predict(X)  # scored on the same X the model was fit on
+    assert spearmanr(preds, y).correlation >= 0.99  # predictions must be almost perfectly rank-correlated with y
diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj
index 342616d27daa..96fe017e96b8 100644
--- a/windows/LightGBM.vcxproj
+++ b/windows/LightGBM.vcxproj
@@ -34,7 +34,7 @@
     <SccLocalPath>SAK</SccLocalPath>
     <SccProvider>SAK</SccProvider>
     <ProjectName>LightGBM</ProjectName>
-    <WindowsTargetPlatformVersion>8.1</WindowsTargetPlatformVersion>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
@@ -101,7 +101,7 @@
   </PropertyGroup>
   <ItemDefinitionGroup>
     <ClCompile>
-      <PreprocessorDefinitions>EIGEN_MPL2_ONLY;EIGEN_DONT_PARALLELIZE</PreprocessorDefinitions>
+      <PreprocessorDefinitions>EIGEN_MPL2_ONLY;EIGEN_DONT_PARALLELIZE;WIN_HAS_INET_PTON;</PreprocessorDefinitions>
     </ClCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug_mpi|x64'">
@@ -306,6 +306,7 @@
     <ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
     <ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
     <ClInclude Include="..\src\treelearner\split_info.hpp" />
+    <ClInclude Include="..\src\treelearner\gradient_discretizer.hpp" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\src\application\application.cpp" />
@@ -341,6 +342,7 @@
     <ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
     <ClCompile Include="..\src\treelearner\tree_learner.cpp" />
     <ClCompile Include="..\src\treelearner\voting_parallel_tree_learner.cpp" />
+    <ClCompile Include="..\src\treelearner\gradient_discretizer.cpp" />
   </ItemGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters
index ed591fc4d87a..27b445893c0f 100644
--- a/windows/LightGBM.vcxproj.filters
+++ b/windows/LightGBM.vcxproj.filters
@@ -51,6 +51,9 @@
     <ClInclude Include="..\src\treelearner\serial_tree_learner.h">
       <Filter>src\treelearner</Filter>
     </ClInclude>
+    <ClInclude Include="..\src\treelearner\gradient_discretizer.hpp">
+      <Filter>src\treelearner</Filter>
+    </ClInclude>
     <ClInclude Include="..\src\application\predictor.hpp">
       <Filter>src\application</Filter>
     </ClInclude>
@@ -338,5 +341,8 @@
     <ClCompile Include="..\src\treelearner\linear_tree_learner.cpp">
       <Filter>src\treelearner</Filter>
     </ClCompile>
+    <ClCompile Include="..\src\treelearner\gradient_discretizer.cpp">
+      <Filter>src\treelearner</Filter>
+    </ClCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file